#include <cuda.h>
#include <cuda_runtime.h>
#include "vhf.cuh"
#include "rys_roots_for_k.cu"
#include "create_tasks.cu"


/*
 * rys_k_0000: contracts shell quartets (ij|kl) that yield a single integral
 * value per quartet (gout0) with the density matrices in kmat.dm and
 * accumulates into kmat.vk with the pattern
 *     vk[i,l] += g*dm[j,k],  vk[j,l] += g*dm[i,k],
 *     vk[i,k] += g*dm[j,l],  vk[j,k] += g*dm[i,l].
 *
 * Work distribution:
 *   - `head` is a global counter; each block repeatedly claims one ij pair
 *     index from it (atomicAdd) until the index reaches bounds.npairs_ij.
 *   - For the claimed pair the block builds a list of kl pairs in its own
 *     slice of `pool` (QUEUE_DEPTH ints per block) via _fill_vk_tasks /
 *     _fill_sr_vk_tasks, then each thread processes one kl entry at a time.
 *
 * Launch requirements visible from the indexing in this kernel:
 *   - 1-D thread blocks with blockDim.x >= 3 (threads 0..2 load coordinates).
 *   - Dynamic shared memory of at least
 *       2*bounds.nroots*blockDim.x + bounds.iprim*bounds.jprim doubles.
 *     NOTE(review): rys_roots_for_k may need additional scratch - confirm
 *     against the launcher.
 */
#if CUDA_VERSION >= 12040
__global__ __maxnreg__(128) static
#else
__global__ static
#endif
void rys_k_0000(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    // This block's private slice of the kl task queue.
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
    while (pair_ij < bounds.npairs_ij) {
        int bas_ij = bounds.pair_ij_mapping[pair_ij];
        if (sq_id == 0) {
            ntasks = 0;
        }
        __syncthreads();
        if (kmat.lr_factor != 0) {
            _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
        } else {
            _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
        }
        if (ntasks == 0) {
            // Nothing to do for this ij pair: claim the next one.
            if (sq_id == 0) {
                pair_ij = atomicAdd(head, 1);
            }
            __syncthreads();
            continue;
        }

        // Dynamic shared memory layout:
        //   [0, 2*nroots*blockDim.x)  roots/weights, interleaved per thread
        //                             (thread t owns elements t + m*blockDim.x)
        //   [.., + iprim*jprim)       cached ci*cj*Kab for the ij primitive pairs
        extern __shared__ double shared_memory[];
        double *rw = shared_memory + sq_id;
        double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

        int nbas = envs.nbas;
        int *bas = envs.bas;
        double *env = envs.env;
        __shared__ int ish;
        __shared__ int jsh;
        __shared__ double ri[3];
        __shared__ double rjri[3];
        __shared__ double *expi;
        __shared__ double *expj;
        if (sq_id == 0) {
            ish = bas_ij / nbas;
            jsh = bas_ij % nbas;
            expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
            expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        }
        if (sq_id < 3) {
            // Use the thread-local bas_ij here instead of the shared ish/jsh:
            // thread 0 stores those just above and no barrier separates that
            // store from this point, so threads 1 and 2 must not read them yet.
            int ish_local = bas_ij / nbas;
            int jsh_local = bas_ij % nbas;
            int ri_ptr = bas[ish_local*BAS_SLOTS+PTR_BAS_COORD];
            int rj_ptr = bas[jsh_local*BAS_SLOTS+PTR_BAS_COORD];
            ri[sq_id] = env[ri_ptr+sq_id];
            rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
        }
        __syncthreads();
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double dx_ij = rjri[0];
        double dy_ij = rjri[1];
        double dz_ij = rjri[2];
        // Squared i-j distance; identical for every primitive pair.
        double rr_ij = dx_ij*dx_ij + dy_ij*dy_ij + dz_ij*dz_ij;
        // Threads cooperatively fill the ij primitive-pair prefactor cache.
        for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
            int ip = ij / jprim;
            int jp = ij % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double aij = ai + aj;
            double theta_ij = ai * aj / aij;
            double Kab = exp(-theta_ij * rr_ij);
            cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
        }
        // Every thread runs the same number of iterations so that the barriers
        // inside the loop are reached by the whole block; threads whose task_id
        // lies past the end of the queue are padding.
        for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
            // Orders the cicj_cache writes above before the reads below.
            __syncthreads();
            int kprim = bounds.kprim;
            int lprim = bounds.lprim;
            bool active = task_id < ntasks;
            // Padding threads reuse queue entry 0 (ntasks > 0 here) so they
            // never read queue slots that were not filled for this ij pair.
            int bas_kl = bas_kl_idx[active ? task_id : 0];
            int ksh = bas_kl / nbas;
            int lsh = bas_kl % nbas;
            double fac_sym = PI_FAC;
            if (active) {
                // Halve the weight for each coincidence among the index pairs.
                if (ish == jsh) fac_sym *= .5;
                if (ksh == lsh) fac_sym *= .5;
                if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
            } else {
                fac_sym = 0;
            }
            double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
            double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
            double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
            double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
            double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
            double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
            // k-l separation does not depend on the primitive indices.
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double gout0;

            gout0 = 0;
            for (int klp = 0; klp < kprim*lprim; ++klp) {
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
                for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                    // NOTE(review): presumably separates successive uses of
                    // shared scratch by rys_roots_for_k - confirm.
                    __syncthreads();
                    int ip = ijp / jprim;
                    int jp = ijp % jprim;
                    double ai = expi[ip];
                    double aj = expj[jp];
                    double aij = ai + aj;
                    double aj_aij = aj / aij;
                    double cicj = cicj_cache[ijp];
                    double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                    double xpa = (rjri[0]) * aj_aij;
                    double ypa = (rjri[1]) * aj_aij;
                    double zpa = (rjri[2]) * aj_aij;
                    double xij = ri[0] + xpa;
                    double yij = ri[1] + ypa;
                    double zij = ri[2] + zpa;
                    double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                    double yqc = ylyk * al_akl;
                    double zqc = zlzk * al_akl;
                    double xkl = rk[0] + xqc;
                    double ykl = rk[1] + yqc;
                    double zkl = rk[2] + zqc;
                    double xpq = xij - xkl;
                    double ypq = yij - ykl;
                    double zpq = zij - zkl;
                    double theta = aij * akl / (aij + akl);
                    double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                    int nroots = bounds.nroots;
                    // Called by every thread, padding included.
                    rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                    if (!active) {
                        continue;
                    }
                    for (int irys = 0; irys < nroots; ++irys) {
                        // Weight of root irys sits at slot 2*irys+1.
                        double wt = rw[(2*irys+1)*nsq_per_block];
                        gout0 += 1 * fac * wt;
                    }
                }
            }
            if (active) {
                int *ao_loc = envs.ao_loc;
                int nao = ao_loc[nbas];
                int i0 = ao_loc[ish];
                int j0 = ao_loc[jsh];
                int k0 = ao_loc[ksh];
                int l0 = ao_loc[lsh];
                // 64-bit offsets: nao*nao and i_dm*nao*nao can exceed INT_MAX.
                size_t nao2 = (size_t)nao * nao;
                size_t i_row = (size_t)i0 * nao;
                size_t j_row = (size_t)j0 * nao;
                double val;
                for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                    double *dm = kmat.dm + i_dm * nao2;
                    double *vk = kmat.vk + i_dm * nao2;
                    val = 0;
                    val += gout0 * dm[j_row+k0];
                    atomicAdd(vk+i_row+l0, val);
                    val = 0;
                    val += gout0 * dm[i_row+k0];
                    atomicAdd(vk+j_row+l0, val);
                    val = 0;
                    val += gout0 * dm[j_row+l0];
                    atomicAdd(vk+i_row+k0, val);
                    val = 0;
                    val += gout0 * dm[i_row+l0];
                    atomicAdd(vk+j_row+k0, val);
                }
            }
        }
        // Claim the next ij pair for this block.
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
    }
}

/*
 * rys_k_1000: contracts shell quartets (ij|kl) in which the i shell spans
 * three functions (AO offsets i0..i0+2) and j, k, l span one each. The three
 * integrals per quartet (gout0..gout2, one per i component) are contracted
 * with the density matrices in kmat.dm and accumulated into kmat.vk:
 *     vk[i,l] += g_i*dm[j,k],       vk[j,l] += sum_i g_i*dm[i,k],
 *     vk[i,k] += g_i*dm[j,l],       vk[j,k] += sum_i g_i*dm[i,l].
 *
 * Work distribution:
 *   - `head` is a global counter; each block repeatedly claims one ij pair
 *     index from it (atomicAdd) until the index reaches bounds.npairs_ij.
 *   - For the claimed pair the block builds a list of kl pairs in its own
 *     slice of `pool` (QUEUE_DEPTH ints per block) via _fill_vk_tasks /
 *     _fill_sr_vk_tasks, then each thread processes one kl entry at a time.
 *
 * Launch requirements visible from the indexing in this kernel:
 *   - 1-D thread blocks with blockDim.x >= 3 (threads 0..2 load coordinates).
 *   - Dynamic shared memory of at least
 *       2*bounds.nroots*blockDim.x + bounds.iprim*bounds.jprim doubles.
 *     NOTE(review): rys_roots_for_k may need additional scratch - confirm
 *     against the launcher.
 */
#if CUDA_VERSION >= 12040
__global__ __maxnreg__(128) static
#else
__global__ static
#endif
void rys_k_1000(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    // This block's private slice of the kl task queue.
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
    while (pair_ij < bounds.npairs_ij) {
        int bas_ij = bounds.pair_ij_mapping[pair_ij];
        if (sq_id == 0) {
            ntasks = 0;
        }
        __syncthreads();
        if (kmat.lr_factor != 0) {
            _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
        } else {
            _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
        }
        if (ntasks == 0) {
            // Nothing to do for this ij pair: claim the next one.
            if (sq_id == 0) {
                pair_ij = atomicAdd(head, 1);
            }
            __syncthreads();
            continue;
        }

        // Dynamic shared memory layout:
        //   [0, 2*nroots*blockDim.x)  roots/weights, interleaved per thread
        //                             (thread t owns elements t + m*blockDim.x)
        //   [.., + iprim*jprim)       cached ci*cj*Kab for the ij primitive pairs
        extern __shared__ double shared_memory[];
        double *rw = shared_memory + sq_id;
        double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

        int nbas = envs.nbas;
        int *bas = envs.bas;
        double *env = envs.env;
        __shared__ int ish;
        __shared__ int jsh;
        __shared__ double ri[3];
        __shared__ double rjri[3];
        __shared__ double *expi;
        __shared__ double *expj;
        if (sq_id == 0) {
            ish = bas_ij / nbas;
            jsh = bas_ij % nbas;
            expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
            expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        }
        if (sq_id < 3) {
            // Use the thread-local bas_ij here instead of the shared ish/jsh:
            // thread 0 stores those just above and no barrier separates that
            // store from this point, so threads 1 and 2 must not read them yet.
            int ish_local = bas_ij / nbas;
            int jsh_local = bas_ij % nbas;
            int ri_ptr = bas[ish_local*BAS_SLOTS+PTR_BAS_COORD];
            int rj_ptr = bas[jsh_local*BAS_SLOTS+PTR_BAS_COORD];
            ri[sq_id] = env[ri_ptr+sq_id];
            rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
        }
        __syncthreads();
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double dx_ij = rjri[0];
        double dy_ij = rjri[1];
        double dz_ij = rjri[2];
        // Squared i-j distance; identical for every primitive pair.
        double rr_ij = dx_ij*dx_ij + dy_ij*dy_ij + dz_ij*dz_ij;
        // Threads cooperatively fill the ij primitive-pair prefactor cache.
        for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
            int ip = ij / jprim;
            int jp = ij % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double aij = ai + aj;
            double theta_ij = ai * aj / aij;
            double Kab = exp(-theta_ij * rr_ij);
            cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
        }
        // Every thread runs the same number of iterations so that the barriers
        // inside the loop are reached by the whole block; threads whose task_id
        // lies past the end of the queue are padding.
        for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
            // Orders the cicj_cache writes above before the reads below.
            __syncthreads();
            int kprim = bounds.kprim;
            int lprim = bounds.lprim;
            bool active = task_id < ntasks;
            // Padding threads reuse queue entry 0 (ntasks > 0 here) so they
            // never read queue slots that were not filled for this ij pair.
            int bas_kl = bas_kl_idx[active ? task_id : 0];
            int ksh = bas_kl / nbas;
            int lsh = bas_kl % nbas;
            double fac_sym = PI_FAC;
            if (active) {
                // Halve the weight for each coincidence among the index pairs.
                if (ish == jsh) fac_sym *= .5;
                if (ksh == lsh) fac_sym *= .5;
                if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
            } else {
                fac_sym = 0;
            }
            double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
            double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
            double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
            double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
            double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
            double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
            // k-l separation does not depend on the primitive indices.
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            // One accumulator per i component (x, y, z order as computed below).
            double gout0;
            double gout1;
            double gout2;

            gout0 = 0;
            gout1 = 0;
            gout2 = 0;
            for (int klp = 0; klp < kprim*lprim; ++klp) {
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
                for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                    // NOTE(review): presumably separates successive uses of
                    // shared scratch by rys_roots_for_k - confirm.
                    __syncthreads();
                    int ip = ijp / jprim;
                    int jp = ijp % jprim;
                    double ai = expi[ip];
                    double aj = expj[jp];
                    double aij = ai + aj;
                    double aj_aij = aj / aij;
                    double cicj = cicj_cache[ijp];
                    double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                    double xpa = (rjri[0]) * aj_aij;
                    double ypa = (rjri[1]) * aj_aij;
                    double zpa = (rjri[2]) * aj_aij;
                    double xij = ri[0] + xpa;
                    double yij = ri[1] + ypa;
                    double zij = ri[2] + zpa;
                    double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                    double yqc = ylyk * al_akl;
                    double zqc = zlzk * al_akl;
                    double xkl = rk[0] + xqc;
                    double ykl = rk[1] + yqc;
                    double zkl = rk[2] + zqc;
                    double xpq = xij - xkl;
                    double ypq = yij - ykl;
                    double zpq = zij - zkl;
                    double theta = aij * akl / (aij + akl);
                    double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                    int nroots = bounds.nroots;
                    // Called by every thread, padding included.
                    rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                    if (!active) {
                        continue;
                    }
                    for (int irys = 0; irys < nroots; ++irys) {
                        // Root irys sits at slot 2*irys, its weight at 2*irys+1.
                        double wt = rw[(2*irys+1)*nsq_per_block];
                        double rt = rw[ 2*irys   *nsq_per_block];
                        double rt_aa = rt / (aij + akl);
                        double xjxi = rjri[0];
                        double rt_aij = rt_aa * akl;
                        double c0x = xjxi*aj_aij - xpq*rt_aij;
                        double trr_10x = c0x * 1;
                        gout0 += trr_10x * fac * wt;
                        double yjyi = rjri[1];
                        double c0y = yjyi*aj_aij - ypq*rt_aij;
                        double trr_10y = c0y * fac;
                        gout1 += 1 * trr_10y * wt;
                        double zjzi = rjri[2];
                        double c0z = zjzi*aj_aij - zpq*rt_aij;
                        double trr_10z = c0z * wt;
                        gout2 += 1 * fac * trr_10z;
                    }
                }
            }
            if (active) {
                int *ao_loc = envs.ao_loc;
                int nao = ao_loc[nbas];
                int i0 = ao_loc[ish];
                int j0 = ao_loc[jsh];
                int k0 = ao_loc[ksh];
                int l0 = ao_loc[lsh];
                // 64-bit offsets: nao*nao and i_dm*nao*nao can exceed INT_MAX.
                size_t nao2 = (size_t)nao * nao;
                size_t i_row0 = (size_t)i0 * nao;   // row i0
                size_t i_row1 = i_row0 + nao;       // row i0+1
                size_t i_row2 = i_row1 + nao;       // row i0+2
                size_t j_row = (size_t)j0 * nao;    // row j0
                double val;
                for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                    double *dm = kmat.dm + i_dm * nao2;
                    double *vk = kmat.vk + i_dm * nao2;
                    // vk[i,l] += g_i * dm[j,k]
                    val = 0;
                    val += gout0 * dm[j_row+k0];
                    atomicAdd(vk+i_row0+l0, val);
                    val = 0;
                    val += gout1 * dm[j_row+k0];
                    atomicAdd(vk+i_row1+l0, val);
                    val = 0;
                    val += gout2 * dm[j_row+k0];
                    atomicAdd(vk+i_row2+l0, val);
                    // vk[j,l] += sum_i g_i * dm[i,k]
                    val = 0;
                    val += gout0 * dm[i_row0+k0];
                    val += gout1 * dm[i_row1+k0];
                    val += gout2 * dm[i_row2+k0];
                    atomicAdd(vk+j_row+l0, val);
                    // vk[i,k] += g_i * dm[j,l]
                    val = 0;
                    val += gout0 * dm[j_row+l0];
                    atomicAdd(vk+i_row0+k0, val);
                    val = 0;
                    val += gout1 * dm[j_row+l0];
                    atomicAdd(vk+i_row1+k0, val);
                    val = 0;
                    val += gout2 * dm[j_row+l0];
                    atomicAdd(vk+i_row2+k0, val);
                    // vk[j,k] += sum_i g_i * dm[i,l]
                    val = 0;
                    val += gout0 * dm[i_row0+l0];
                    val += gout1 * dm[i_row1+l0];
                    val += gout2 * dm[i_row2+l0];
                    atomicAdd(vk+j_row+k0, val);
                }
            }
        }
        // Claim the next ij pair for this block.
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
    }
}

/*
 * rys_k_1010: contracts shell quartets (ij|kl) in which the i and k shells
 * span three functions each (AO offsets i0..i0+2, k0..k0+2) and j, l span one
 * each. The nine integrals per quartet are held in gout0..gout8, where
 * gout[i + 3*k] belongs to i component `i` and k component `k`. They are
 * contracted with the density matrices in kmat.dm and accumulated into
 * kmat.vk:
 *     vk[i,l] += sum_k g_ik*dm[j,k],     vk[j,l] += sum_ik g_ik*dm[i,k],
 *     vk[i,k] += g_ik*dm[j,l],           vk[j,k] += sum_i g_ik*dm[i,l].
 *
 * Work distribution:
 *   - `head` is a global counter; each block repeatedly claims one ij pair
 *     index from it (atomicAdd) until the index reaches bounds.npairs_ij.
 *   - For the claimed pair the block builds a list of kl pairs in its own
 *     slice of `pool` (QUEUE_DEPTH ints per block) via _fill_vk_tasks /
 *     _fill_sr_vk_tasks, then each thread processes one kl entry at a time.
 *
 * Launch requirements visible from the indexing in this kernel:
 *   - 1-D thread blocks with blockDim.x >= 3 (threads 0..2 load coordinates).
 *   - Dynamic shared memory of at least
 *       2*bounds.nroots*blockDim.x + bounds.iprim*bounds.jprim doubles.
 *     NOTE(review): rys_roots_for_k may need additional scratch - confirm
 *     against the launcher.
 */
#if CUDA_VERSION >= 12040
__global__ __maxnreg__(128) static
#else
__global__ static
#endif
void rys_k_1010(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    // This block's private slice of the kl task queue.
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
    while (pair_ij < bounds.npairs_ij) {
        int bas_ij = bounds.pair_ij_mapping[pair_ij];
        if (sq_id == 0) {
            ntasks = 0;
        }
        __syncthreads();
        if (kmat.lr_factor != 0) {
            _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
        } else {
            _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
        }
        if (ntasks == 0) {
            // Nothing to do for this ij pair: claim the next one.
            if (sq_id == 0) {
                pair_ij = atomicAdd(head, 1);
            }
            __syncthreads();
            continue;
        }

        // Dynamic shared memory layout:
        //   [0, 2*nroots*blockDim.x)  roots/weights, interleaved per thread
        //                             (thread t owns elements t + m*blockDim.x)
        //   [.., + iprim*jprim)       cached ci*cj*Kab for the ij primitive pairs
        extern __shared__ double shared_memory[];
        double *rw = shared_memory + sq_id;
        double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

        int nbas = envs.nbas;
        int *bas = envs.bas;
        double *env = envs.env;
        __shared__ int ish;
        __shared__ int jsh;
        __shared__ double ri[3];
        __shared__ double rjri[3];
        __shared__ double *expi;
        __shared__ double *expj;
        if (sq_id == 0) {
            ish = bas_ij / nbas;
            jsh = bas_ij % nbas;
            expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
            expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
        }
        if (sq_id < 3) {
            // Use the thread-local bas_ij here instead of the shared ish/jsh:
            // thread 0 stores those just above and no barrier separates that
            // store from this point, so threads 1 and 2 must not read them yet.
            int ish_local = bas_ij / nbas;
            int jsh_local = bas_ij % nbas;
            int ri_ptr = bas[ish_local*BAS_SLOTS+PTR_BAS_COORD];
            int rj_ptr = bas[jsh_local*BAS_SLOTS+PTR_BAS_COORD];
            ri[sq_id] = env[ri_ptr+sq_id];
            rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
        }
        __syncthreads();
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
        double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
        double dx_ij = rjri[0];
        double dy_ij = rjri[1];
        double dz_ij = rjri[2];
        // Squared i-j distance; identical for every primitive pair.
        double rr_ij = dx_ij*dx_ij + dy_ij*dy_ij + dz_ij*dz_ij;
        // Threads cooperatively fill the ij primitive-pair prefactor cache.
        for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
            int ip = ij / jprim;
            int jp = ij % jprim;
            double ai = expi[ip];
            double aj = expj[jp];
            double aij = ai + aj;
            double theta_ij = ai * aj / aij;
            double Kab = exp(-theta_ij * rr_ij);
            cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
        }
        // Every thread runs the same number of iterations so that the barriers
        // inside the loop are reached by the whole block; threads whose task_id
        // lies past the end of the queue are padding.
        for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
            // Orders the cicj_cache writes above before the reads below.
            __syncthreads();
            int kprim = bounds.kprim;
            int lprim = bounds.lprim;
            bool active = task_id < ntasks;
            // Padding threads reuse queue entry 0 (ntasks > 0 here) so they
            // never read queue slots that were not filled for this ij pair.
            int bas_kl = bas_kl_idx[active ? task_id : 0];
            int ksh = bas_kl / nbas;
            int lsh = bas_kl % nbas;
            double fac_sym = PI_FAC;
            if (active) {
                // Halve the weight for each coincidence among the index pairs.
                if (ish == jsh) fac_sym *= .5;
                if (ksh == lsh) fac_sym *= .5;
                if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
            } else {
                fac_sym = 0;
            }
            double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
            double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
            double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
            double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
            double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
            double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
            // k-l separation does not depend on the primitive indices.
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            // gout[i + 3*k]: i component varies fastest, k component slowest.
            double gout0;
            double gout1;
            double gout2;
            double gout3;
            double gout4;
            double gout5;
            double gout6;
            double gout7;
            double gout8;

            gout0 = 0;
            gout1 = 0;
            gout2 = 0;
            gout3 = 0;
            gout4 = 0;
            gout5 = 0;
            gout6 = 0;
            gout7 = 0;
            gout8 = 0;
            for (int klp = 0; klp < kprim*lprim; ++klp) {
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
                for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                    // NOTE(review): presumably separates successive uses of
                    // shared scratch by rys_roots_for_k - confirm.
                    __syncthreads();
                    int ip = ijp / jprim;
                    int jp = ijp % jprim;
                    double ai = expi[ip];
                    double aj = expj[jp];
                    double aij = ai + aj;
                    double aj_aij = aj / aij;
                    double cicj = cicj_cache[ijp];
                    double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                    double xpa = (rjri[0]) * aj_aij;
                    double ypa = (rjri[1]) * aj_aij;
                    double zpa = (rjri[2]) * aj_aij;
                    double xij = ri[0] + xpa;
                    double yij = ri[1] + ypa;
                    double zij = ri[2] + zpa;
                    double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                    double yqc = ylyk * al_akl;
                    double zqc = zlzk * al_akl;
                    double xkl = rk[0] + xqc;
                    double ykl = rk[1] + yqc;
                    double zkl = rk[2] + zqc;
                    double xpq = xij - xkl;
                    double ypq = yij - ykl;
                    double zpq = zij - zkl;
                    double theta = aij * akl / (aij + akl);
                    double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                    int nroots = bounds.nroots;
                    // Called by every thread, padding included.
                    rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                    if (!active) {
                        continue;
                    }
                    for (int irys = 0; irys < nroots; ++irys) {
                        // Root irys sits at slot 2*irys, its weight at 2*irys+1.
                        double wt = rw[(2*irys+1)*nsq_per_block];
                        double rt = rw[ 2*irys   *nsq_per_block];
                        double rt_aa = rt / (aij + akl);
                        double b00 = .5 * rt_aa;
                        double rt_akl = rt_aa * aij;
                        double cpx = xlxk*al_akl + xpq*rt_akl;
                        double xjxi = rjri[0];
                        double rt_aij = rt_aa * akl;
                        double c0x = xjxi*aj_aij - xpq*rt_aij;
                        double trr_10x = c0x * 1;
                        double trr_11x = cpx * trr_10x + 1*b00 * 1;
                        gout0 += trr_11x * fac * wt;
                        double trr_01x = cpx * 1;
                        double yjyi = rjri[1];
                        double c0y = yjyi*aj_aij - ypq*rt_aij;
                        // fac is folded into the y factors, wt into the z factors.
                        double trr_10y = c0y * fac;
                        gout1 += trr_01x * trr_10y * wt;
                        double zjzi = rjri[2];
                        double c0z = zjzi*aj_aij - zpq*rt_aij;
                        double trr_10z = c0z * wt;
                        gout2 += trr_01x * fac * trr_10z;
                        double cpy = ylyk*al_akl + ypq*rt_akl;
                        double trr_01y = cpy * fac;
                        gout3 += trr_10x * trr_01y * wt;
                        double trr_11y = cpy * trr_10y + 1*b00 * fac;
                        gout4 += 1 * trr_11y * wt;
                        gout5 += 1 * trr_01y * trr_10z;
                        double cpz = zlzk*al_akl + zpq*rt_akl;
                        double trr_01z = cpz * wt;
                        gout6 += trr_10x * fac * trr_01z;
                        gout7 += 1 * trr_10y * trr_01z;
                        double trr_11z = cpz * trr_10z + 1*b00 * wt;
                        gout8 += 1 * fac * trr_11z;
                    }
                }
            }
            if (active) {
                int *ao_loc = envs.ao_loc;
                int nao = ao_loc[nbas];
                int i0 = ao_loc[ish];
                int j0 = ao_loc[jsh];
                int k0 = ao_loc[ksh];
                int l0 = ao_loc[lsh];
                // 64-bit offsets: nao*nao and i_dm*nao*nao can exceed INT_MAX.
                size_t nao2 = (size_t)nao * nao;
                size_t i_row0 = (size_t)i0 * nao;   // row i0
                size_t i_row1 = i_row0 + nao;       // row i0+1
                size_t i_row2 = i_row1 + nao;       // row i0+2
                size_t j_row = (size_t)j0 * nao;    // row j0
                double val;
                for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                    double *dm = kmat.dm + i_dm * nao2;
                    double *vk = kmat.vk + i_dm * nao2;
                    // vk[i,l] += sum_k g_ik * dm[j,k]
                    val = 0;
                    val += gout0 * dm[j_row+k0];
                    val += gout3 * dm[j_row+k0+1];
                    val += gout6 * dm[j_row+k0+2];
                    atomicAdd(vk+i_row0+l0, val);
                    val = 0;
                    val += gout1 * dm[j_row+k0];
                    val += gout4 * dm[j_row+k0+1];
                    val += gout7 * dm[j_row+k0+2];
                    atomicAdd(vk+i_row1+l0, val);
                    val = 0;
                    val += gout2 * dm[j_row+k0];
                    val += gout5 * dm[j_row+k0+1];
                    val += gout8 * dm[j_row+k0+2];
                    atomicAdd(vk+i_row2+l0, val);
                    // vk[j,l] += sum_ik g_ik * dm[i,k]
                    val = 0;
                    val += gout0 * dm[i_row0+k0];
                    val += gout3 * dm[i_row0+k0+1];
                    val += gout6 * dm[i_row0+k0+2];
                    val += gout1 * dm[i_row1+k0];
                    val += gout4 * dm[i_row1+k0+1];
                    val += gout7 * dm[i_row1+k0+2];
                    val += gout2 * dm[i_row2+k0];
                    val += gout5 * dm[i_row2+k0+1];
                    val += gout8 * dm[i_row2+k0+2];
                    atomicAdd(vk+j_row+l0, val);
                    // vk[i,k] += g_ik * dm[j,l]
                    val = 0;
                    val += gout0 * dm[j_row+l0];
                    atomicAdd(vk+i_row0+k0, val);
                    val = 0;
                    val += gout3 * dm[j_row+l0];
                    atomicAdd(vk+i_row0+k0+1, val);
                    val = 0;
                    val += gout6 * dm[j_row+l0];
                    atomicAdd(vk+i_row0+k0+2, val);
                    val = 0;
                    val += gout1 * dm[j_row+l0];
                    atomicAdd(vk+i_row1+k0, val);
                    val = 0;
                    val += gout4 * dm[j_row+l0];
                    atomicAdd(vk+i_row1+k0+1, val);
                    val = 0;
                    val += gout7 * dm[j_row+l0];
                    atomicAdd(vk+i_row1+k0+2, val);
                    val = 0;
                    val += gout2 * dm[j_row+l0];
                    atomicAdd(vk+i_row2+k0, val);
                    val = 0;
                    val += gout5 * dm[j_row+l0];
                    atomicAdd(vk+i_row2+k0+1, val);
                    val = 0;
                    val += gout8 * dm[j_row+l0];
                    atomicAdd(vk+i_row2+k0+2, val);
                    // vk[j,k] += sum_i g_ik * dm[i,l]
                    val = 0;
                    val += gout0 * dm[i_row0+l0];
                    val += gout1 * dm[i_row1+l0];
                    val += gout2 * dm[i_row2+l0];
                    atomicAdd(vk+j_row+k0, val);
                    val = 0;
                    val += gout3 * dm[i_row0+l0];
                    val += gout4 * dm[i_row1+l0];
                    val += gout5 * dm[i_row2+l0];
                    atomicAdd(vk+j_row+k0+1, val);
                    val = 0;
                    val += gout6 * dm[i_row0+l0];
                    val += gout7 * dm[i_row1+l0];
                    val += gout8 * dm[i_row2+l0];
                    atomicAdd(vk+j_row+k0+2, val);
                }
            }
        }
        // Claim the next ij pair for this block.
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
    }
}

__global__ static
void rys_k_1011(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double b01 = .5/akl * (1 - rt_akl);
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double trr_01x = cpx * 1;
                    double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
                    double hrr_1011x = trr_12x - xlxk * trr_11x;
                    gout0 += hrr_1011x * fac * wt;
                    double trr_02x = cpx * trr_01x + 1*b01 * 1;
                    double hrr_0011x = trr_02x - xlxk * trr_01x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_0011x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_0011x * fac * trr_10z;
                    double hrr_1001x = trr_11x - xlxk * trr_10x;
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout3 += hrr_1001x * trr_01y * wt;
                    double hrr_0001x = trr_01x - xlxk * 1;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout4 += hrr_0001x * trr_11y * wt;
                    gout5 += hrr_0001x * trr_01y * trr_10z;
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout6 += hrr_1001x * fac * trr_01z;
                    gout7 += hrr_0001x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout8 += hrr_0001x * fac * trr_11z;
                    double hrr_0001y = trr_01y - ylyk * fac;
                    gout9 += trr_11x * hrr_0001y * wt;
                    double hrr_1001y = trr_11y - ylyk * trr_10y;
                    gout10 += trr_01x * hrr_1001y * wt;
                    gout11 += trr_01x * hrr_0001y * trr_10z;
                    double trr_02y = cpy * trr_01y + 1*b01 * fac;
                    double hrr_0011y = trr_02y - ylyk * trr_01y;
                    gout12 += trr_10x * hrr_0011y * wt;
                    double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
                    double hrr_1011y = trr_12y - ylyk * trr_11y;
                    gout13 += 1 * hrr_1011y * wt;
                    gout14 += 1 * hrr_0011y * trr_10z;
                    gout15 += trr_10x * hrr_0001y * trr_01z;
                    gout16 += 1 * hrr_1001y * trr_01z;
                    gout17 += 1 * hrr_0001y * trr_11z;
                    double hrr_0001z = trr_01z - zlzk * wt;
                    gout18 += trr_11x * fac * hrr_0001z;
                    gout19 += trr_01x * trr_10y * hrr_0001z;
                    double hrr_1001z = trr_11z - zlzk * trr_10z;
                    gout20 += trr_01x * fac * hrr_1001z;
                    gout21 += trr_10x * trr_01y * hrr_0001z;
                    gout22 += 1 * trr_11y * hrr_0001z;
                    gout23 += 1 * trr_01y * hrr_1001z;
                    double trr_02z = cpz * trr_01z + 1*b01 * wt;
                    double hrr_0011z = trr_02z - zlzk * trr_01z;
                    gout24 += trr_10x * fac * hrr_0011z;
                    gout25 += 1 * trr_10y * hrr_0011z;
                    double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
                    double hrr_1011z = trr_12z - zlzk * trr_11z;
                    gout26 += 1 * fac * hrr_1011z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+0)*nao+(k0+1)];
                val += gout6 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+1)];
                val += gout15 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+1)];
                val += gout24 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout4 * dm[(j0+0)*nao+(k0+1)];
                val += gout7 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+1)];
                val += gout16 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+0)];
                val += gout22 * dm[(j0+0)*nao+(k0+1)];
                val += gout25 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout5 * dm[(j0+0)*nao+(k0+1)];
                val += gout8 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+0)*nao+(k0+1)];
                val += gout17 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(k0+0)];
                val += gout23 * dm[(j0+0)*nao+(k0+1)];
                val += gout26 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout3 * dm[(i0+0)*nao+(k0+1)];
                val += gout6 * dm[(i0+0)*nao+(k0+2)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout4 * dm[(i0+1)*nao+(k0+1)];
                val += gout7 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout5 * dm[(i0+2)*nao+(k0+1)];
                val += gout8 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(k0+0)];
                val += gout12 * dm[(i0+0)*nao+(k0+1)];
                val += gout15 * dm[(i0+0)*nao+(k0+2)];
                val += gout10 * dm[(i0+1)*nao+(k0+0)];
                val += gout13 * dm[(i0+1)*nao+(k0+1)];
                val += gout16 * dm[(i0+1)*nao+(k0+2)];
                val += gout11 * dm[(i0+2)*nao+(k0+0)];
                val += gout14 * dm[(i0+2)*nao+(k0+1)];
                val += gout17 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(k0+0)];
                val += gout21 * dm[(i0+0)*nao+(k0+1)];
                val += gout24 * dm[(i0+0)*nao+(k0+2)];
                val += gout19 * dm[(i0+1)*nao+(k0+0)];
                val += gout22 * dm[(i0+1)*nao+(k0+1)];
                val += gout25 * dm[(i0+1)*nao+(k0+2)];
                val += gout20 * dm[(i0+2)*nao+(k0+0)];
                val += gout23 * dm[(i0+2)*nao+(k0+1)];
                val += gout26 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+1)];
                val += gout24 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                val += gout22 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+1)];
                val += gout25 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                val += gout23 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+1)];
                val += gout26 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout9 * dm[(i0+0)*nao+(l0+1)];
                val += gout18 * dm[(i0+0)*nao+(l0+2)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout10 * dm[(i0+1)*nao+(l0+1)];
                val += gout19 * dm[(i0+1)*nao+(l0+2)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout11 * dm[(i0+2)*nao+(l0+1)];
                val += gout20 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(l0+0)];
                val += gout12 * dm[(i0+0)*nao+(l0+1)];
                val += gout21 * dm[(i0+0)*nao+(l0+2)];
                val += gout4 * dm[(i0+1)*nao+(l0+0)];
                val += gout13 * dm[(i0+1)*nao+(l0+1)];
                val += gout22 * dm[(i0+1)*nao+(l0+2)];
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                val += gout14 * dm[(i0+2)*nao+(l0+1)];
                val += gout23 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout15 * dm[(i0+0)*nao+(l0+1)];
                val += gout24 * dm[(i0+0)*nao+(l0+2)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout16 * dm[(i0+1)*nao+(l0+1)];
                val += gout25 * dm[(i0+1)*nao+(l0+2)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout17 * dm[(i0+2)*nao+(l0+1)];
                val += gout26 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

/*
 * rys_k_1100: accumulate contributions to kmat.vk for one class of shell
 * quartets (ij|kl), contracting nine integral components with kmat.dm.
 * The write-back addresses three AO offsets on i and on j and a single AO
 * offset on k and on l.
 *
 * Work distribution, as implemented below:
 *   - a block claims ij-pair slots with atomicAdd(head, 1) until the slot
 *     index reaches bounds.npairs_ij;
 *   - for each claimed pair, _fill_vk_tasks / _fill_sr_vk_tasks (chosen by
 *     kmat.lr_factor != 0) fills this block's slice of `pool`
 *     (QUEUE_DEPTH ints per block) with packed kl pairs and sets ntasks;
 *   - threads walk that queue with stride blockDim.x, one kl pair per thread
 *     per pass. All threads run the same number of passes so that they all
 *     reach the barriers; threads without a task ("padding" threads)
 *     contribute nothing to vk.
 *
 * Parameters:
 *   envs   - basis data: nbas, bas (BAS_SLOTS ints per shell), env, ao_loc.
 *   kmat   - dm / vk buffers holding n_dm matrices of nao*nao doubles, plus
 *            omega, lr_factor and sr_factor forwarded to rys_roots_for_k.
 *   bounds - pair_ij_mapping, npairs_ij, primitive counts and nroots.
 *   pool   - scratch task queues, one slice of QUEUE_DEPTH ints per block.
 *   head   - global counter used to hand out ij-pair slots.
 *
 * Launch requirements implied by the indexing in this kernel:
 *   - 1-D blocks with at least 3 threads (threads 0..2 each stage one
 *     Cartesian component of the i centre);
 *   - dynamic shared memory of at least
 *     (2*bounds.nroots*blockDim.x + bounds.iprim*bounds.jprim) doubles.
 */
#if CUDA_VERSION >= 12040
__global__ __maxnreg__(128) static
#else
__global__ static
#endif
void rys_k_1100(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        // Nothing queued for this ij pair: claim the next slot and retry.
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    // Dynamic shared memory layout:
    //   [0, 2*nroots*blockDim.x)  per-thread root/weight slots, thread-interleaved
    //   [.., + iprim*jprim)       cached ci*cj*Kab for every ij primitive pair
    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    // Every thread holds bas_ij, so the shell indices are decoded locally.
    // Reading the shared ish/jsh here would race with thread 0's store,
    // because no barrier separates the two.
    int ish_ij = bas_ij / nbas;
    int jsh_ij = bas_ij % nbas;
    if (sq_id == 0) {
        ish = ish_ij;
        jsh = jsh_ij;
        expi = env + bas[ish_ij*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh_ij*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        // One Cartesian component per thread.
        int ri_ptr = bas[ish_ij*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh_ij*BAS_SLOTS+PTR_BAS_COORD];
        double ri_c = env[ri_ptr+sq_id];
        ri[sq_id] = ri_c;
        rjri[sq_id] = env[rj_ptr+sq_id] - ri_c;
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    // Squared i-j distance; identical for every primitive pair.
    double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    // The bound ntasks+sq_id gives every thread the same trip count.
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        // Also publishes cicj_cache before its first use.
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        bool active = task_id < ntasks;
        // Padding threads reuse queue entry 0 (valid because ntasks > 0 here)
        // rather than decoding a slot that was not filled for this ij pair.
        // Their prefactor is zero and they skip accumulation and write-back.
        // NOTE(review): assumes rys_roots_for_k works on this thread's own
        // rw column only - confirm against its definition.
        int bas_kl = bas_kl_idx[active ? task_id : 0];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = 0;
        if (active) {
            fac_sym = PI_FAC;
            // Halve the weight once for each coincidence below.
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        // Centre data is independent of the primitive loops.
        double xk = rk[0];
        double yk = rk[1];
        double zk = rk[2];
        double xlxk = rl[0] - xk;
        double ylyk = rl[1] - yk;
        double zlzk = rl[2] - zk;
        double rr_kl = xlxk*xlxk+ylyk*ylyk+zlzk*zlzk;

        // gout[a + 3*b]: a = Cartesian index on i, b = Cartesian index on j.
        double gout0 = 0;
        double gout1 = 0;
        double gout2 = 0;
        double gout3 = 0;
        double gout4 = 0;
        double gout5 = 0;
        double gout6 = 0;
        double gout7 = 0;
        double gout8 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * rr_kl);
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            // Weighted kl centre: (ak*rk + al*rl)/akl.
            double xkl = xk + xlxk * al_akl;
            double ykl = yk + ylyk * al_akl;
            double zkl = zk + zlzk * al_akl;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                // Block-wide barrier: every thread, padding ones included,
                // must reach it on every iteration.
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                // Weighted ij centre minus weighted kl centre.
                double xpq = (ri[0] + xjxi * aj_aij) - xkl;
                double ypq = (ri[1] + yjyi * aj_aij) - ykl;
                double zpq = (ri[2] + zjzi * aj_aij) - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                // Called by all threads before padding threads bail out.
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (!active) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    // Even slots of this thread's rw column hold rt, odd slots wt.
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);

                    // x factors carry no prefactor.
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double hrr_0100x = trr_10x - xjxi * 1;
                    double hrr_1100x = trr_20x - xjxi * trr_10x;

                    // y factors carry fac.
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    double hrr_0100y = trr_10y - yjyi * fac;
                    double hrr_1100y = trr_20y - yjyi * trr_10y;

                    // z factors carry wt.
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    double hrr_0100z = trr_10z - zjzi * wt;
                    double hrr_1100z = trr_20z - zjzi * trr_10z;

                    // Each term is x * y * z; where an axis has no factor of
                    // its own, its prefactor (fac or wt) appears explicitly.
                    gout0 += hrr_1100x * fac * wt;
                    gout1 += hrr_0100x * trr_10y * wt;
                    gout2 += hrr_0100x * fac * trr_10z;
                    gout3 += trr_10x * hrr_0100y * wt;
                    gout4 += 1 * hrr_1100y * wt;
                    gout5 += 1 * hrr_0100y * trr_10z;
                    gout6 += trr_10x * fac * hrr_0100z;
                    gout7 += 1 * trr_10y * hrr_0100z;
                    gout8 += 1 * fac * hrr_1100z;
                }
            }
        }
        if (active) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            // 64-bit offsets: nao*nao and i_dm*nao*nao can exceed INT_MAX.
            size_t row = (size_t)nao;
            size_t nao2 = row * row;
            size_t i_row = (size_t)i0 * row;
            size_t j_row = (size_t)j0 * row;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao2;
                double *vk = kmat.vk + i_dm * nao2;
                double *dm_jk = dm + j_row + k0;
                double *dm_ik = dm + i_row + k0;
                double *dm_jl = dm + j_row + l0;
                double *dm_il = dm + i_row + l0;
                double *vk_il = vk + i_row + l0;
                double *vk_jl = vk + j_row + l0;
                double *vk_ik = vk + i_row + k0;
                double *vk_jk = vk + j_row + k0;
                double d0, d1, d2;

                // vk[i0+a, l0] += sum_b gout[a+3b] * dm[j0+b, k0]
                d0 = dm_jk[0];
                d1 = dm_jk[row];
                d2 = dm_jk[2*row];
                atomicAdd(vk_il,         gout0 * d0 + gout3 * d1 + gout6 * d2);
                atomicAdd(vk_il + row,   gout1 * d0 + gout4 * d1 + gout7 * d2);
                atomicAdd(vk_il + 2*row, gout2 * d0 + gout5 * d1 + gout8 * d2);

                // vk[j0+b, l0] += sum_a gout[a+3b] * dm[i0+a, k0]
                d0 = dm_ik[0];
                d1 = dm_ik[row];
                d2 = dm_ik[2*row];
                atomicAdd(vk_jl,         gout0 * d0 + gout1 * d1 + gout2 * d2);
                atomicAdd(vk_jl + row,   gout3 * d0 + gout4 * d1 + gout5 * d2);
                atomicAdd(vk_jl + 2*row, gout6 * d0 + gout7 * d1 + gout8 * d2);

                // vk[i0+a, k0] += sum_b gout[a+3b] * dm[j0+b, l0]
                d0 = dm_jl[0];
                d1 = dm_jl[row];
                d2 = dm_jl[2*row];
                atomicAdd(vk_ik,         gout0 * d0 + gout3 * d1 + gout6 * d2);
                atomicAdd(vk_ik + row,   gout1 * d0 + gout4 * d1 + gout7 * d2);
                atomicAdd(vk_ik + 2*row, gout2 * d0 + gout5 * d1 + gout8 * d2);

                // vk[j0+b, k0] += sum_a gout[a+3b] * dm[i0+a, l0]
                d0 = dm_il[0];
                d1 = dm_il[row];
                d2 = dm_il[2*row];
                atomicAdd(vk_jk,         gout0 * d0 + gout1 * d1 + gout2 * d2);
                atomicAdd(vk_jk + row,   gout3 * d0 + gout4 * d1 + gout5 * d2);
                atomicAdd(vk_jk + 2*row, gout6 * d0 + gout7 * d1 + gout8 * d2);
            }
        }
    }
    // Claim the next ij slot. The barrier also keeps thread 0 from
    // overwriting the shared per-pair state while other threads still use it.
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_1110(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double hrr_1110x = trr_21x - xjxi * trr_11x;
                    gout0 += hrr_1110x * fac * wt;
                    double trr_01x = cpx * 1;
                    double hrr_0110x = trr_11x - xjxi * trr_01x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_0110x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_0110x * fac * trr_10z;
                    double hrr_0100y = trr_10y - yjyi * fac;
                    gout3 += trr_11x * hrr_0100y * wt;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    double hrr_1100y = trr_20y - yjyi * trr_10y;
                    gout4 += trr_01x * hrr_1100y * wt;
                    gout5 += trr_01x * hrr_0100y * trr_10z;
                    double hrr_0100z = trr_10z - zjzi * wt;
                    gout6 += trr_11x * fac * hrr_0100z;
                    gout7 += trr_01x * trr_10y * hrr_0100z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    double hrr_1100z = trr_20z - zjzi * trr_10z;
                    gout8 += trr_01x * fac * hrr_1100z;
                    double hrr_1100x = trr_20x - xjxi * trr_10x;
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout9 += hrr_1100x * trr_01y * wt;
                    double hrr_0100x = trr_10x - xjxi * 1;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout10 += hrr_0100x * trr_11y * wt;
                    gout11 += hrr_0100x * trr_01y * trr_10z;
                    double hrr_0110y = trr_11y - yjyi * trr_01y;
                    gout12 += trr_10x * hrr_0110y * wt;
                    double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                    double hrr_1110y = trr_21y - yjyi * trr_11y;
                    gout13 += 1 * hrr_1110y * wt;
                    gout14 += 1 * hrr_0110y * trr_10z;
                    gout15 += trr_10x * trr_01y * hrr_0100z;
                    gout16 += 1 * trr_11y * hrr_0100z;
                    gout17 += 1 * trr_01y * hrr_1100z;
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout18 += hrr_1100x * fac * trr_01z;
                    gout19 += hrr_0100x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout20 += hrr_0100x * fac * trr_11z;
                    gout21 += trr_10x * hrr_0100y * trr_01z;
                    gout22 += 1 * hrr_1100y * trr_01z;
                    gout23 += 1 * hrr_0100y * trr_11z;
                    double hrr_0110z = trr_11z - zjzi * trr_01z;
                    gout24 += trr_10x * fac * hrr_0110z;
                    gout25 += 1 * trr_10y * hrr_0110z;
                    double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                    double hrr_1110z = trr_21z - zjzi * trr_11z;
                    gout26 += 1 * fac * hrr_1110z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout3 * dm[(j0+1)*nao+(k0+0)];
                val += gout12 * dm[(j0+1)*nao+(k0+1)];
                val += gout21 * dm[(j0+1)*nao+(k0+2)];
                val += gout6 * dm[(j0+2)*nao+(k0+0)];
                val += gout15 * dm[(j0+2)*nao+(k0+1)];
                val += gout24 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout19 * dm[(j0+0)*nao+(k0+2)];
                val += gout4 * dm[(j0+1)*nao+(k0+0)];
                val += gout13 * dm[(j0+1)*nao+(k0+1)];
                val += gout22 * dm[(j0+1)*nao+(k0+2)];
                val += gout7 * dm[(j0+2)*nao+(k0+0)];
                val += gout16 * dm[(j0+2)*nao+(k0+1)];
                val += gout25 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout20 * dm[(j0+0)*nao+(k0+2)];
                val += gout5 * dm[(j0+1)*nao+(k0+0)];
                val += gout14 * dm[(j0+1)*nao+(k0+1)];
                val += gout23 * dm[(j0+1)*nao+(k0+2)];
                val += gout8 * dm[(j0+2)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+1)];
                val += gout26 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout9 * dm[(i0+0)*nao+(k0+1)];
                val += gout18 * dm[(i0+0)*nao+(k0+2)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout10 * dm[(i0+1)*nao+(k0+1)];
                val += gout19 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout11 * dm[(i0+2)*nao+(k0+1)];
                val += gout20 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(k0+0)];
                val += gout12 * dm[(i0+0)*nao+(k0+1)];
                val += gout21 * dm[(i0+0)*nao+(k0+2)];
                val += gout4 * dm[(i0+1)*nao+(k0+0)];
                val += gout13 * dm[(i0+1)*nao+(k0+1)];
                val += gout22 * dm[(i0+1)*nao+(k0+2)];
                val += gout5 * dm[(i0+2)*nao+(k0+0)];
                val += gout14 * dm[(i0+2)*nao+(k0+1)];
                val += gout23 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+0)];
                val += gout15 * dm[(i0+0)*nao+(k0+1)];
                val += gout24 * dm[(i0+0)*nao+(k0+2)];
                val += gout7 * dm[(i0+1)*nao+(k0+0)];
                val += gout16 * dm[(i0+1)*nao+(k0+1)];
                val += gout25 * dm[(i0+1)*nao+(k0+2)];
                val += gout8 * dm[(i0+2)*nao+(k0+0)];
                val += gout17 * dm[(i0+2)*nao+(k0+1)];
                val += gout26 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+1)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+1)*nao+(l0+0)];
                val += gout24 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout4 * dm[(j0+1)*nao+(l0+0)];
                val += gout7 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+1)*nao+(l0+0)];
                val += gout16 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+1)*nao+(l0+0)];
                val += gout25 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout5 * dm[(j0+1)*nao+(l0+0)];
                val += gout8 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+1)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(l0+0)];
                val += gout23 * dm[(j0+1)*nao+(l0+0)];
                val += gout26 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(l0+0)];
                val += gout10 * dm[(i0+1)*nao+(l0+0)];
                val += gout11 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+0)];
                val += gout19 * dm[(i0+1)*nao+(l0+0)];
                val += gout20 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(l0+0)];
                val += gout4 * dm[(i0+1)*nao+(l0+0)];
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout21 * dm[(i0+0)*nao+(l0+0)];
                val += gout22 * dm[(i0+1)*nao+(l0+0)];
                val += gout23 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(l0+0)];
                val += gout16 * dm[(i0+1)*nao+(l0+0)];
                val += gout17 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout24 * dm[(i0+0)*nao+(l0+0)];
                val += gout25 * dm[(i0+1)*nao+(l0+0)];
                val += gout26 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_1111(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;
        double gout30;
        double gout31;
        double gout32;
        double gout33;
        double gout34;
        double gout35;
        double gout36;
        double gout37;
        double gout38;
        double gout39;
        double gout40;
        double gout41;
        double gout42;
        double gout43;
        double gout44;
        double gout45;
        double gout46;
        double gout47;
        double gout48;
        double gout49;
        double gout50;
        double gout51;
        double gout52;
        double gout53;
        double gout54;
        double gout55;
        double gout56;
        double gout57;
        double gout58;
        double gout59;
        double gout60;
        double gout61;
        double gout62;
        double gout63;
        double gout64;
        double gout65;
        double gout66;
        double gout67;
        double gout68;
        double gout69;
        double gout70;
        double gout71;
        double gout72;
        double gout73;
        double gout74;
        double gout75;
        double gout76;
        double gout77;
        double gout78;
        double gout79;
        double gout80;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        gout30 = 0;
        gout31 = 0;
        gout32 = 0;
        gout33 = 0;
        gout34 = 0;
        gout35 = 0;
        gout36 = 0;
        gout37 = 0;
        gout38 = 0;
        gout39 = 0;
        gout40 = 0;
        gout41 = 0;
        gout42 = 0;
        gout43 = 0;
        gout44 = 0;
        gout45 = 0;
        gout46 = 0;
        gout47 = 0;
        gout48 = 0;
        gout49 = 0;
        gout50 = 0;
        gout51 = 0;
        gout52 = 0;
        gout53 = 0;
        gout54 = 0;
        gout55 = 0;
        gout56 = 0;
        gout57 = 0;
        gout58 = 0;
        gout59 = 0;
        gout60 = 0;
        gout61 = 0;
        gout62 = 0;
        gout63 = 0;
        gout64 = 0;
        gout65 = 0;
        gout66 = 0;
        gout67 = 0;
        gout68 = 0;
        gout69 = 0;
        gout70 = 0;
        gout71 = 0;
        gout72 = 0;
        gout73 = 0;
        gout74 = 0;
        gout75 = 0;
        gout76 = 0;
        gout77 = 0;
        gout78 = 0;
        gout79 = 0;
        gout80 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double b01 = .5/akl * (1 - rt_akl);
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
                    double hrr_2011x = trr_22x - xlxk * trr_21x;
                    double trr_01x = cpx * 1;
                    double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
                    double hrr_1011x = trr_12x - xlxk * trr_11x;
                    double hrr_1111x = hrr_2011x - xjxi * hrr_1011x;
                    gout0 += hrr_1111x * fac * wt;
                    double trr_02x = cpx * trr_01x + 1*b01 * 1;
                    double hrr_0011x = trr_02x - xlxk * trr_01x;
                    double hrr_0111x = hrr_1011x - xjxi * hrr_0011x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_0111x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_0111x * fac * trr_10z;
                    double hrr_0100y = trr_10y - yjyi * fac;
                    gout3 += hrr_1011x * hrr_0100y * wt;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    double hrr_1100y = trr_20y - yjyi * trr_10y;
                    gout4 += hrr_0011x * hrr_1100y * wt;
                    gout5 += hrr_0011x * hrr_0100y * trr_10z;
                    double hrr_0100z = trr_10z - zjzi * wt;
                    gout6 += hrr_1011x * fac * hrr_0100z;
                    gout7 += hrr_0011x * trr_10y * hrr_0100z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    double hrr_1100z = trr_20z - zjzi * trr_10z;
                    gout8 += hrr_0011x * fac * hrr_1100z;
                    double hrr_2001x = trr_21x - xlxk * trr_20x;
                    double hrr_1001x = trr_11x - xlxk * trr_10x;
                    double hrr_1101x = hrr_2001x - xjxi * hrr_1001x;
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout9 += hrr_1101x * trr_01y * wt;
                    double hrr_0001x = trr_01x - xlxk * 1;
                    double hrr_0101x = hrr_1001x - xjxi * hrr_0001x;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout10 += hrr_0101x * trr_11y * wt;
                    gout11 += hrr_0101x * trr_01y * trr_10z;
                    double hrr_0110y = trr_11y - yjyi * trr_01y;
                    gout12 += hrr_1001x * hrr_0110y * wt;
                    double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                    double hrr_1110y = trr_21y - yjyi * trr_11y;
                    gout13 += hrr_0001x * hrr_1110y * wt;
                    gout14 += hrr_0001x * hrr_0110y * trr_10z;
                    gout15 += hrr_1001x * trr_01y * hrr_0100z;
                    gout16 += hrr_0001x * trr_11y * hrr_0100z;
                    gout17 += hrr_0001x * trr_01y * hrr_1100z;
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout18 += hrr_1101x * fac * trr_01z;
                    gout19 += hrr_0101x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout20 += hrr_0101x * fac * trr_11z;
                    gout21 += hrr_1001x * hrr_0100y * trr_01z;
                    gout22 += hrr_0001x * hrr_1100y * trr_01z;
                    gout23 += hrr_0001x * hrr_0100y * trr_11z;
                    double hrr_0110z = trr_11z - zjzi * trr_01z;
                    gout24 += hrr_1001x * fac * hrr_0110z;
                    gout25 += hrr_0001x * trr_10y * hrr_0110z;
                    double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                    double hrr_1110z = trr_21z - zjzi * trr_11z;
                    gout26 += hrr_0001x * fac * hrr_1110z;
                    double hrr_1110x = trr_21x - xjxi * trr_11x;
                    double hrr_0001y = trr_01y - ylyk * fac;
                    gout27 += hrr_1110x * hrr_0001y * wt;
                    double hrr_0110x = trr_11x - xjxi * trr_01x;
                    double hrr_1001y = trr_11y - ylyk * trr_10y;
                    gout28 += hrr_0110x * hrr_1001y * wt;
                    gout29 += hrr_0110x * hrr_0001y * trr_10z;
                    double hrr_0101y = hrr_1001y - yjyi * hrr_0001y;
                    gout30 += trr_11x * hrr_0101y * wt;
                    double hrr_2001y = trr_21y - ylyk * trr_20y;
                    double hrr_1101y = hrr_2001y - yjyi * hrr_1001y;
                    gout31 += trr_01x * hrr_1101y * wt;
                    gout32 += trr_01x * hrr_0101y * trr_10z;
                    gout33 += trr_11x * hrr_0001y * hrr_0100z;
                    gout34 += trr_01x * hrr_1001y * hrr_0100z;
                    gout35 += trr_01x * hrr_0001y * hrr_1100z;
                    double hrr_1100x = trr_20x - xjxi * trr_10x;
                    double trr_02y = cpy * trr_01y + 1*b01 * fac;
                    double hrr_0011y = trr_02y - ylyk * trr_01y;
                    gout36 += hrr_1100x * hrr_0011y * wt;
                    double hrr_0100x = trr_10x - xjxi * 1;
                    double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
                    double hrr_1011y = trr_12y - ylyk * trr_11y;
                    gout37 += hrr_0100x * hrr_1011y * wt;
                    gout38 += hrr_0100x * hrr_0011y * trr_10z;
                    double hrr_0111y = hrr_1011y - yjyi * hrr_0011y;
                    gout39 += trr_10x * hrr_0111y * wt;
                    double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
                    double hrr_2011y = trr_22y - ylyk * trr_21y;
                    double hrr_1111y = hrr_2011y - yjyi * hrr_1011y;
                    gout40 += 1 * hrr_1111y * wt;
                    gout41 += 1 * hrr_0111y * trr_10z;
                    gout42 += trr_10x * hrr_0011y * hrr_0100z;
                    gout43 += 1 * hrr_1011y * hrr_0100z;
                    gout44 += 1 * hrr_0011y * hrr_1100z;
                    gout45 += hrr_1100x * hrr_0001y * trr_01z;
                    gout46 += hrr_0100x * hrr_1001y * trr_01z;
                    gout47 += hrr_0100x * hrr_0001y * trr_11z;
                    gout48 += trr_10x * hrr_0101y * trr_01z;
                    gout49 += 1 * hrr_1101y * trr_01z;
                    gout50 += 1 * hrr_0101y * trr_11z;
                    gout51 += trr_10x * hrr_0001y * hrr_0110z;
                    gout52 += 1 * hrr_1001y * hrr_0110z;
                    gout53 += 1 * hrr_0001y * hrr_1110z;
                    double hrr_0001z = trr_01z - zlzk * wt;
                    gout54 += hrr_1110x * fac * hrr_0001z;
                    gout55 += hrr_0110x * trr_10y * hrr_0001z;
                    double hrr_1001z = trr_11z - zlzk * trr_10z;
                    gout56 += hrr_0110x * fac * hrr_1001z;
                    gout57 += trr_11x * hrr_0100y * hrr_0001z;
                    gout58 += trr_01x * hrr_1100y * hrr_0001z;
                    gout59 += trr_01x * hrr_0100y * hrr_1001z;
                    double hrr_0101z = hrr_1001z - zjzi * hrr_0001z;
                    gout60 += trr_11x * fac * hrr_0101z;
                    gout61 += trr_01x * trr_10y * hrr_0101z;
                    double hrr_2001z = trr_21z - zlzk * trr_20z;
                    double hrr_1101z = hrr_2001z - zjzi * hrr_1001z;
                    gout62 += trr_01x * fac * hrr_1101z;
                    gout63 += hrr_1100x * trr_01y * hrr_0001z;
                    gout64 += hrr_0100x * trr_11y * hrr_0001z;
                    gout65 += hrr_0100x * trr_01y * hrr_1001z;
                    gout66 += trr_10x * hrr_0110y * hrr_0001z;
                    gout67 += 1 * hrr_1110y * hrr_0001z;
                    gout68 += 1 * hrr_0110y * hrr_1001z;
                    gout69 += trr_10x * trr_01y * hrr_0101z;
                    gout70 += 1 * trr_11y * hrr_0101z;
                    gout71 += 1 * trr_01y * hrr_1101z;
                    double trr_02z = cpz * trr_01z + 1*b01 * wt;
                    double hrr_0011z = trr_02z - zlzk * trr_01z;
                    gout72 += hrr_1100x * fac * hrr_0011z;
                    gout73 += hrr_0100x * trr_10y * hrr_0011z;
                    double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
                    double hrr_1011z = trr_12z - zlzk * trr_11z;
                    gout74 += hrr_0100x * fac * hrr_1011z;
                    gout75 += trr_10x * hrr_0100y * hrr_0011z;
                    gout76 += 1 * hrr_1100y * hrr_0011z;
                    gout77 += 1 * hrr_0100y * hrr_1011z;
                    double hrr_0111z = hrr_1011z - zjzi * hrr_0011z;
                    gout78 += trr_10x * fac * hrr_0111z;
                    gout79 += 1 * trr_10y * hrr_0111z;
                    double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
                    double hrr_2011z = trr_22z - zlzk * trr_21z;
                    double hrr_1111z = hrr_2011z - zjzi * hrr_1011z;
                    gout80 += 1 * fac * hrr_1111z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout3 * dm[(j0+1)*nao+(k0+0)];
                val += gout12 * dm[(j0+1)*nao+(k0+1)];
                val += gout21 * dm[(j0+1)*nao+(k0+2)];
                val += gout6 * dm[(j0+2)*nao+(k0+0)];
                val += gout15 * dm[(j0+2)*nao+(k0+1)];
                val += gout24 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout27 * dm[(j0+0)*nao+(k0+0)];
                val += gout36 * dm[(j0+0)*nao+(k0+1)];
                val += gout45 * dm[(j0+0)*nao+(k0+2)];
                val += gout30 * dm[(j0+1)*nao+(k0+0)];
                val += gout39 * dm[(j0+1)*nao+(k0+1)];
                val += gout48 * dm[(j0+1)*nao+(k0+2)];
                val += gout33 * dm[(j0+2)*nao+(k0+0)];
                val += gout42 * dm[(j0+2)*nao+(k0+1)];
                val += gout51 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout54 * dm[(j0+0)*nao+(k0+0)];
                val += gout63 * dm[(j0+0)*nao+(k0+1)];
                val += gout72 * dm[(j0+0)*nao+(k0+2)];
                val += gout57 * dm[(j0+1)*nao+(k0+0)];
                val += gout66 * dm[(j0+1)*nao+(k0+1)];
                val += gout75 * dm[(j0+1)*nao+(k0+2)];
                val += gout60 * dm[(j0+2)*nao+(k0+0)];
                val += gout69 * dm[(j0+2)*nao+(k0+1)];
                val += gout78 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout19 * dm[(j0+0)*nao+(k0+2)];
                val += gout4 * dm[(j0+1)*nao+(k0+0)];
                val += gout13 * dm[(j0+1)*nao+(k0+1)];
                val += gout22 * dm[(j0+1)*nao+(k0+2)];
                val += gout7 * dm[(j0+2)*nao+(k0+0)];
                val += gout16 * dm[(j0+2)*nao+(k0+1)];
                val += gout25 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout28 * dm[(j0+0)*nao+(k0+0)];
                val += gout37 * dm[(j0+0)*nao+(k0+1)];
                val += gout46 * dm[(j0+0)*nao+(k0+2)];
                val += gout31 * dm[(j0+1)*nao+(k0+0)];
                val += gout40 * dm[(j0+1)*nao+(k0+1)];
                val += gout49 * dm[(j0+1)*nao+(k0+2)];
                val += gout34 * dm[(j0+2)*nao+(k0+0)];
                val += gout43 * dm[(j0+2)*nao+(k0+1)];
                val += gout52 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout55 * dm[(j0+0)*nao+(k0+0)];
                val += gout64 * dm[(j0+0)*nao+(k0+1)];
                val += gout73 * dm[(j0+0)*nao+(k0+2)];
                val += gout58 * dm[(j0+1)*nao+(k0+0)];
                val += gout67 * dm[(j0+1)*nao+(k0+1)];
                val += gout76 * dm[(j0+1)*nao+(k0+2)];
                val += gout61 * dm[(j0+2)*nao+(k0+0)];
                val += gout70 * dm[(j0+2)*nao+(k0+1)];
                val += gout79 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout20 * dm[(j0+0)*nao+(k0+2)];
                val += gout5 * dm[(j0+1)*nao+(k0+0)];
                val += gout14 * dm[(j0+1)*nao+(k0+1)];
                val += gout23 * dm[(j0+1)*nao+(k0+2)];
                val += gout8 * dm[(j0+2)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+1)];
                val += gout26 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout29 * dm[(j0+0)*nao+(k0+0)];
                val += gout38 * dm[(j0+0)*nao+(k0+1)];
                val += gout47 * dm[(j0+0)*nao+(k0+2)];
                val += gout32 * dm[(j0+1)*nao+(k0+0)];
                val += gout41 * dm[(j0+1)*nao+(k0+1)];
                val += gout50 * dm[(j0+1)*nao+(k0+2)];
                val += gout35 * dm[(j0+2)*nao+(k0+0)];
                val += gout44 * dm[(j0+2)*nao+(k0+1)];
                val += gout53 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout56 * dm[(j0+0)*nao+(k0+0)];
                val += gout65 * dm[(j0+0)*nao+(k0+1)];
                val += gout74 * dm[(j0+0)*nao+(k0+2)];
                val += gout59 * dm[(j0+1)*nao+(k0+0)];
                val += gout68 * dm[(j0+1)*nao+(k0+1)];
                val += gout77 * dm[(j0+1)*nao+(k0+2)];
                val += gout62 * dm[(j0+2)*nao+(k0+0)];
                val += gout71 * dm[(j0+2)*nao+(k0+1)];
                val += gout80 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout9 * dm[(i0+0)*nao+(k0+1)];
                val += gout18 * dm[(i0+0)*nao+(k0+2)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout10 * dm[(i0+1)*nao+(k0+1)];
                val += gout19 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout11 * dm[(i0+2)*nao+(k0+1)];
                val += gout20 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout27 * dm[(i0+0)*nao+(k0+0)];
                val += gout36 * dm[(i0+0)*nao+(k0+1)];
                val += gout45 * dm[(i0+0)*nao+(k0+2)];
                val += gout28 * dm[(i0+1)*nao+(k0+0)];
                val += gout37 * dm[(i0+1)*nao+(k0+1)];
                val += gout46 * dm[(i0+1)*nao+(k0+2)];
                val += gout29 * dm[(i0+2)*nao+(k0+0)];
                val += gout38 * dm[(i0+2)*nao+(k0+1)];
                val += gout47 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout54 * dm[(i0+0)*nao+(k0+0)];
                val += gout63 * dm[(i0+0)*nao+(k0+1)];
                val += gout72 * dm[(i0+0)*nao+(k0+2)];
                val += gout55 * dm[(i0+1)*nao+(k0+0)];
                val += gout64 * dm[(i0+1)*nao+(k0+1)];
                val += gout73 * dm[(i0+1)*nao+(k0+2)];
                val += gout56 * dm[(i0+2)*nao+(k0+0)];
                val += gout65 * dm[(i0+2)*nao+(k0+1)];
                val += gout74 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(k0+0)];
                val += gout12 * dm[(i0+0)*nao+(k0+1)];
                val += gout21 * dm[(i0+0)*nao+(k0+2)];
                val += gout4 * dm[(i0+1)*nao+(k0+0)];
                val += gout13 * dm[(i0+1)*nao+(k0+1)];
                val += gout22 * dm[(i0+1)*nao+(k0+2)];
                val += gout5 * dm[(i0+2)*nao+(k0+0)];
                val += gout14 * dm[(i0+2)*nao+(k0+1)];
                val += gout23 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout30 * dm[(i0+0)*nao+(k0+0)];
                val += gout39 * dm[(i0+0)*nao+(k0+1)];
                val += gout48 * dm[(i0+0)*nao+(k0+2)];
                val += gout31 * dm[(i0+1)*nao+(k0+0)];
                val += gout40 * dm[(i0+1)*nao+(k0+1)];
                val += gout49 * dm[(i0+1)*nao+(k0+2)];
                val += gout32 * dm[(i0+2)*nao+(k0+0)];
                val += gout41 * dm[(i0+2)*nao+(k0+1)];
                val += gout50 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout57 * dm[(i0+0)*nao+(k0+0)];
                val += gout66 * dm[(i0+0)*nao+(k0+1)];
                val += gout75 * dm[(i0+0)*nao+(k0+2)];
                val += gout58 * dm[(i0+1)*nao+(k0+0)];
                val += gout67 * dm[(i0+1)*nao+(k0+1)];
                val += gout76 * dm[(i0+1)*nao+(k0+2)];
                val += gout59 * dm[(i0+2)*nao+(k0+0)];
                val += gout68 * dm[(i0+2)*nao+(k0+1)];
                val += gout77 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+0)];
                val += gout15 * dm[(i0+0)*nao+(k0+1)];
                val += gout24 * dm[(i0+0)*nao+(k0+2)];
                val += gout7 * dm[(i0+1)*nao+(k0+0)];
                val += gout16 * dm[(i0+1)*nao+(k0+1)];
                val += gout25 * dm[(i0+1)*nao+(k0+2)];
                val += gout8 * dm[(i0+2)*nao+(k0+0)];
                val += gout17 * dm[(i0+2)*nao+(k0+1)];
                val += gout26 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout33 * dm[(i0+0)*nao+(k0+0)];
                val += gout42 * dm[(i0+0)*nao+(k0+1)];
                val += gout51 * dm[(i0+0)*nao+(k0+2)];
                val += gout34 * dm[(i0+1)*nao+(k0+0)];
                val += gout43 * dm[(i0+1)*nao+(k0+1)];
                val += gout52 * dm[(i0+1)*nao+(k0+2)];
                val += gout35 * dm[(i0+2)*nao+(k0+0)];
                val += gout44 * dm[(i0+2)*nao+(k0+1)];
                val += gout53 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout60 * dm[(i0+0)*nao+(k0+0)];
                val += gout69 * dm[(i0+0)*nao+(k0+1)];
                val += gout78 * dm[(i0+0)*nao+(k0+2)];
                val += gout61 * dm[(i0+1)*nao+(k0+0)];
                val += gout70 * dm[(i0+1)*nao+(k0+1)];
                val += gout79 * dm[(i0+1)*nao+(k0+2)];
                val += gout62 * dm[(i0+2)*nao+(k0+0)];
                val += gout71 * dm[(i0+2)*nao+(k0+1)];
                val += gout80 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout27 * dm[(j0+0)*nao+(l0+1)];
                val += gout54 * dm[(j0+0)*nao+(l0+2)];
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                val += gout30 * dm[(j0+1)*nao+(l0+1)];
                val += gout57 * dm[(j0+1)*nao+(l0+2)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                val += gout33 * dm[(j0+2)*nao+(l0+1)];
                val += gout60 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout36 * dm[(j0+0)*nao+(l0+1)];
                val += gout63 * dm[(j0+0)*nao+(l0+2)];
                val += gout12 * dm[(j0+1)*nao+(l0+0)];
                val += gout39 * dm[(j0+1)*nao+(l0+1)];
                val += gout66 * dm[(j0+1)*nao+(l0+2)];
                val += gout15 * dm[(j0+2)*nao+(l0+0)];
                val += gout42 * dm[(j0+2)*nao+(l0+1)];
                val += gout69 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout45 * dm[(j0+0)*nao+(l0+1)];
                val += gout72 * dm[(j0+0)*nao+(l0+2)];
                val += gout21 * dm[(j0+1)*nao+(l0+0)];
                val += gout48 * dm[(j0+1)*nao+(l0+1)];
                val += gout75 * dm[(j0+1)*nao+(l0+2)];
                val += gout24 * dm[(j0+2)*nao+(l0+0)];
                val += gout51 * dm[(j0+2)*nao+(l0+1)];
                val += gout78 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout28 * dm[(j0+0)*nao+(l0+1)];
                val += gout55 * dm[(j0+0)*nao+(l0+2)];
                val += gout4 * dm[(j0+1)*nao+(l0+0)];
                val += gout31 * dm[(j0+1)*nao+(l0+1)];
                val += gout58 * dm[(j0+1)*nao+(l0+2)];
                val += gout7 * dm[(j0+2)*nao+(l0+0)];
                val += gout34 * dm[(j0+2)*nao+(l0+1)];
                val += gout61 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                val += gout37 * dm[(j0+0)*nao+(l0+1)];
                val += gout64 * dm[(j0+0)*nao+(l0+2)];
                val += gout13 * dm[(j0+1)*nao+(l0+0)];
                val += gout40 * dm[(j0+1)*nao+(l0+1)];
                val += gout67 * dm[(j0+1)*nao+(l0+2)];
                val += gout16 * dm[(j0+2)*nao+(l0+0)];
                val += gout43 * dm[(j0+2)*nao+(l0+1)];
                val += gout70 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                val += gout46 * dm[(j0+0)*nao+(l0+1)];
                val += gout73 * dm[(j0+0)*nao+(l0+2)];
                val += gout22 * dm[(j0+1)*nao+(l0+0)];
                val += gout49 * dm[(j0+1)*nao+(l0+1)];
                val += gout76 * dm[(j0+1)*nao+(l0+2)];
                val += gout25 * dm[(j0+2)*nao+(l0+0)];
                val += gout52 * dm[(j0+2)*nao+(l0+1)];
                val += gout79 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout29 * dm[(j0+0)*nao+(l0+1)];
                val += gout56 * dm[(j0+0)*nao+(l0+2)];
                val += gout5 * dm[(j0+1)*nao+(l0+0)];
                val += gout32 * dm[(j0+1)*nao+(l0+1)];
                val += gout59 * dm[(j0+1)*nao+(l0+2)];
                val += gout8 * dm[(j0+2)*nao+(l0+0)];
                val += gout35 * dm[(j0+2)*nao+(l0+1)];
                val += gout62 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+0)];
                val += gout38 * dm[(j0+0)*nao+(l0+1)];
                val += gout65 * dm[(j0+0)*nao+(l0+2)];
                val += gout14 * dm[(j0+1)*nao+(l0+0)];
                val += gout41 * dm[(j0+1)*nao+(l0+1)];
                val += gout68 * dm[(j0+1)*nao+(l0+2)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                val += gout44 * dm[(j0+2)*nao+(l0+1)];
                val += gout71 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(l0+0)];
                val += gout47 * dm[(j0+0)*nao+(l0+1)];
                val += gout74 * dm[(j0+0)*nao+(l0+2)];
                val += gout23 * dm[(j0+1)*nao+(l0+0)];
                val += gout50 * dm[(j0+1)*nao+(l0+1)];
                val += gout77 * dm[(j0+1)*nao+(l0+2)];
                val += gout26 * dm[(j0+2)*nao+(l0+0)];
                val += gout53 * dm[(j0+2)*nao+(l0+1)];
                val += gout80 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout27 * dm[(i0+0)*nao+(l0+1)];
                val += gout54 * dm[(i0+0)*nao+(l0+2)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout28 * dm[(i0+1)*nao+(l0+1)];
                val += gout55 * dm[(i0+1)*nao+(l0+2)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout29 * dm[(i0+2)*nao+(l0+1)];
                val += gout56 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(l0+0)];
                val += gout36 * dm[(i0+0)*nao+(l0+1)];
                val += gout63 * dm[(i0+0)*nao+(l0+2)];
                val += gout10 * dm[(i0+1)*nao+(l0+0)];
                val += gout37 * dm[(i0+1)*nao+(l0+1)];
                val += gout64 * dm[(i0+1)*nao+(l0+2)];
                val += gout11 * dm[(i0+2)*nao+(l0+0)];
                val += gout38 * dm[(i0+2)*nao+(l0+1)];
                val += gout65 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+0)];
                val += gout45 * dm[(i0+0)*nao+(l0+1)];
                val += gout72 * dm[(i0+0)*nao+(l0+2)];
                val += gout19 * dm[(i0+1)*nao+(l0+0)];
                val += gout46 * dm[(i0+1)*nao+(l0+1)];
                val += gout73 * dm[(i0+1)*nao+(l0+2)];
                val += gout20 * dm[(i0+2)*nao+(l0+0)];
                val += gout47 * dm[(i0+2)*nao+(l0+1)];
                val += gout74 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(l0+0)];
                val += gout30 * dm[(i0+0)*nao+(l0+1)];
                val += gout57 * dm[(i0+0)*nao+(l0+2)];
                val += gout4 * dm[(i0+1)*nao+(l0+0)];
                val += gout31 * dm[(i0+1)*nao+(l0+1)];
                val += gout58 * dm[(i0+1)*nao+(l0+2)];
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                val += gout32 * dm[(i0+2)*nao+(l0+1)];
                val += gout59 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout39 * dm[(i0+0)*nao+(l0+1)];
                val += gout66 * dm[(i0+0)*nao+(l0+2)];
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout40 * dm[(i0+1)*nao+(l0+1)];
                val += gout67 * dm[(i0+1)*nao+(l0+2)];
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                val += gout41 * dm[(i0+2)*nao+(l0+1)];
                val += gout68 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout21 * dm[(i0+0)*nao+(l0+0)];
                val += gout48 * dm[(i0+0)*nao+(l0+1)];
                val += gout75 * dm[(i0+0)*nao+(l0+2)];
                val += gout22 * dm[(i0+1)*nao+(l0+0)];
                val += gout49 * dm[(i0+1)*nao+(l0+1)];
                val += gout76 * dm[(i0+1)*nao+(l0+2)];
                val += gout23 * dm[(i0+2)*nao+(l0+0)];
                val += gout50 * dm[(i0+2)*nao+(l0+1)];
                val += gout77 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout33 * dm[(i0+0)*nao+(l0+1)];
                val += gout60 * dm[(i0+0)*nao+(l0+2)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout34 * dm[(i0+1)*nao+(l0+1)];
                val += gout61 * dm[(i0+1)*nao+(l0+2)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout35 * dm[(i0+2)*nao+(l0+1)];
                val += gout62 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(l0+0)];
                val += gout42 * dm[(i0+0)*nao+(l0+1)];
                val += gout69 * dm[(i0+0)*nao+(l0+2)];
                val += gout16 * dm[(i0+1)*nao+(l0+0)];
                val += gout43 * dm[(i0+1)*nao+(l0+1)];
                val += gout70 * dm[(i0+1)*nao+(l0+2)];
                val += gout17 * dm[(i0+2)*nao+(l0+0)];
                val += gout44 * dm[(i0+2)*nao+(l0+1)];
                val += gout71 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout24 * dm[(i0+0)*nao+(l0+0)];
                val += gout51 * dm[(i0+0)*nao+(l0+1)];
                val += gout78 * dm[(i0+0)*nao+(l0+2)];
                val += gout25 * dm[(i0+1)*nao+(l0+0)];
                val += gout52 * dm[(i0+1)*nao+(l0+1)];
                val += gout79 * dm[(i0+1)*nao+(l0+2)];
                val += gout26 * dm[(i0+2)*nao+(l0+0)];
                val += gout53 * dm[(i0+2)*nao+(l0+1)];
                val += gout80 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

/*
 * rys_k_2000: exchange-type (K) contraction for shell quartets in which the
 * i shell spans six functions (i0+0 .. i0+5) and the j, k, l shells span one
 * function each (presumably the (d s | s s) class the "2000" suffix refers to).
 *
 * Launch layout, as used by the code below:
 *   - 1-D thread blocks; only threadIdx.x / blockDim.x / blockIdx.x are read.
 *   - `pool` provides QUEUE_DEPTH ints per block for the kl task queue.
 *   - `head` is a global counter; each block claims the next ij pair with
 *     atomicAdd(head, 1) until the value reaches bounds.npairs_ij.
 *   - Dynamic shared memory (doubles): blockDim.x * bounds.nroots * 2 entries
 *     for the Rys roots/weights (`rw`, strided by blockDim.x per thread),
 *     followed by bounds.iprim * bounds.jprim entries for `cicj_cache`.
 *     NOTE(review): assumes rys_roots_for_k writes only those rw slots.
 *
 * Results are accumulated into kmat.vk with atomicAdd on double, which
 * requires SM60+.
 */
#if CUDA_VERSION >= 12040
__global__ __maxnreg__(128) static
#else
__global__ static
#endif
void rys_k_2000(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    // Per-block slice of the task queue.
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    // Thread 0 claims the first ij pair for the whole block.
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    // Build the list of kl shell pairs for this ij pair. The helper used
    // depends on whether a long-range factor is present.
    // NOTE(review): ntasks is read right after the call, so the helpers
    // presumably end with a block-wide barrier -- confirm in create_tasks.cu.
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        // Nothing to do for this ij pair: claim the next one.
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    // Each thread owns the rw entries at offsets sq_id + n*nsq_per_block.
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        // Derive the shell indices from bas_ij locally rather than reading the
        // shared ish/jsh: thread 0 has only just written them and no barrier
        // separates that write from this read, so threads 1 and 2 could
        // otherwise observe stale values.
        int ish_local = bas_ij / nbas;
        int jsh_local = bas_ij % nbas;
        int ri_ptr = bas[ish_local*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh_local*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    // Squared i-j distance; identical for every primitive pair, so computed once.
    double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
    // Cache ci*cj*exp(-theta_ij*rr_ij) for every (ip, jp) primitive pair.
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    // The bound ntasks+sq_id gives every thread the same trip count, so the
    // barriers inside the loop are reached by the whole block even when a
    // thread has no task left (task_id >= ntasks).
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        // Also makes the cicj_cache writes above visible before first use.
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        // NOTE(review): for task_id >= ntasks this reads a queue slot past the
        // filled range; presumably the slot still holds a valid shell pair --
        // confirm against the task-filling helpers.
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            // Halve the weight for each coincidence of shell indices.
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            // Idle thread: contributes nothing.
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        // Accumulators for the six i-components, in the order
        // xx, xy, xz, yy, yz, zz (see the products formed below).
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                // Block-wide barrier before each root evaluation.
                // NOTE(review): presumably guards reuse of the shared rw
                // buffer between iterations -- confirm in rys_roots_for_k.
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                // Called by every thread, including idle ones, before the
                // early-out below.
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    // Root at the even slot, weight at the odd slot.
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    gout0 += trr_20x * fac * wt;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    // The y terms carry the prefactor fac ...
                    double trr_10y = c0y * fac;
                    gout1 += trr_10x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    // ... and the z terms carry the quadrature weight wt.
                    double trr_10z = c0z * wt;
                    gout2 += trr_10x * fac * trr_10z;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += 1 * trr_20y * wt;
                    gout4 += 1 * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += 1 * fac * trr_20z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            // Size of one nao x nao matrix, kept in size_t so the per-matrix
            // offset i_dm*nao*nao cannot overflow int.
            const size_t nao2 = (size_t)nao * nao;
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao2;
                double *vk = kmat.vk + i_dm * nao2;
                // vk[i,l] += g[i] * dm[j,k]
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                // vk[j,l] += sum_i g[i] * dm[i,k]
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                // vk[i,k] += g[i] * dm[j,l]
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                // vk[j,k] += sum_i g[i] * dm[i,l]
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
            }
        }
    }
    // Claim the next ij pair; the barrier publishes it to the whole block and
    // keeps the shared state of this iteration intact until all threads finish.
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

/*
 * rys_k_2010 -- unrolled exchange-matrix (vk) kernel for one class of shell
 * quartets (ij|kl).
 *
 * The unrolled code accumulates 18 integrals per quartet, indexed as
 * gout[i_comp + 6*k_comp] with 6 components on shell i, 3 on shell k and a
 * single component on shells j and l (presumably angular momenta
 * (li,lj,lk,ll) = (2,0,1,0), as the name suffix suggests -- confirm against
 * the code generator).
 *
 * Work distribution:
 *   - Each block repeatedly claims an ij shell-pair index from the global
 *     counter `*head` (atomicAdd) until the index reaches bounds.npairs_ij.
 *   - For the claimed pair, _fill_vk_tasks / _fill_sr_vk_tasks (selected by
 *     kmat.lr_factor) are expected to write kl shell-pair ids into this
 *     block's slice of `pool` (QUEUE_DEPTH ints per block) and set `ntasks`.
 *   - Each thread then processes kl tasks task_id = threadIdx.x,
 *     threadIdx.x + blockDim.x, ...
 *
 * Launch requirements visible from this code:
 *   - 1D thread blocks (only threadIdx.x / blockDim.x are used) with
 *     blockDim.x >= 3, because threads 0..2 load the x/y/z coordinates.
 *   - Dynamic shared memory of at least
 *       (2*bounds.nroots*blockDim.x + bounds.iprim*bounds.jprim) doubles:
 *     the first part holds roots/weights interleaved per thread, the second
 *     part caches the contracted ij prefactors.
 *
 * Parameters:
 *   envs   basis description (bas, env, ao_loc, nbas).
 *   kmat   density matrices `dm`, output `vk` (n_dm matrices of nao*nao),
 *          and the omega / lr_factor / sr_factor values forwarded to
 *          rys_roots_for_k.
 *   bounds primitive counts, number of roots, and the ij pair list.
 *   pool   task-queue storage, QUEUE_DEPTH ints per block.
 *   head   global counter used to hand out ij pairs.
 *
 * Results are accumulated into kmat.vk with atomicAdd.
 */
#if CUDA_VERSION >= 12040
__global__ __maxnreg__(128) static
#else
__global__ static
#endif
void rys_k_2010(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    // This block's private slice of the kl task queue.
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        // Nothing to do for this ij pair: claim the next one.
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    // Roots/weights for this thread, strided by the block size.
    double *rw = shared_memory + sq_id;
    // Contracted ij prefactors, stored after the roots/weights area.
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        // Derive the shell indices from the thread-local bas_ij instead of
        // reading the shared ish/jsh: thread 0 has only just written those and
        // no barrier separates that write from this read.
        int ish_ij = bas_ij / nbas;
        int jsh_ij = bas_ij % nbas;
        int ri_ptr = bas[ish_ij*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh_ij*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    int kprim = bounds.kprim;
    int lprim = bounds.lprim;
    int nroots = bounds.nroots;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    // Squared distance between the centers of shells i and j; it is the same
    // for every primitive pair, so compute it once.
    double rr_ij = rjri[0]*rjri[0] + rjri[1]*rjri[1] + rjri[2]*rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    // The bound `ntasks+sq_id` gives every thread the same trip count, so the
    // barriers inside the loop are reached by the whole block. Threads whose
    // task_id falls past ntasks run as padding and contribute nothing.
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        // On the first pass this barrier also publishes cicj_cache.
        __syncthreads();
        bool active = task_id < ntasks;
        // Padding threads reuse queue slot 0 (valid because ntasks > 0 here)
        // rather than reading a slot that was not filled for this ij pair.
        int bas_kl = bas_kl_idx[active ? task_id : 0];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (active) {
            // Halve the weight for each coincidence i==j, k==l, (ij)==(kl).
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        // Integral accumulators: gout[i_comp + 6*k_comp].
        double gout0 = 0;
        double gout1 = 0;
        double gout2 = 0;
        double gout3 = 0;
        double gout4 = 0;
        double gout5 = 0;
        double gout6 = 0;
        double gout7 = 0;
        double gout8 = 0;
        double gout9 = 0;
        double gout10 = 0;
        double gout11 = 0;
        double gout12 = 0;
        double gout13 = 0;
        double gout14 = 0;
        double gout15 = 0;
        double gout16 = 0;
        double gout17 = 0;
        // Displacement from shell k to shell l; independent of the primitives.
        double xlxk = rl[0] - rk[0];
        double ylyk = rl[1] - rk[1];
        double zlzk = rl[2] - rk[2];
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                // Called by every thread, padding threads included
                // (presumably because it synchronizes the block internally --
                // confirm in rys_roots_for_k). Roots/weights land in `rw`.
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (!active) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    // Root at slot 2*irys, weight at slot 2*irys+1.
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    // `fac` is folded into the y factors and `wt` into the z
                    // factors, so x*y*z products already carry fac*wt.
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                    gout0 += trr_21x * fac * wt;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += trr_11x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += trr_11x * fac * trr_10z;
                    double trr_01x = cpx * 1;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += trr_01x * trr_20y * wt;
                    gout4 += trr_01x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += trr_01x * fac * trr_20z;
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout6 += trr_20x * trr_01y * wt;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout7 += trr_10x * trr_11y * wt;
                    gout8 += trr_10x * trr_01y * trr_10z;
                    double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                    gout9 += 1 * trr_21y * wt;
                    gout10 += 1 * trr_11y * trr_10z;
                    gout11 += 1 * trr_01y * trr_20z;
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout12 += trr_20x * fac * trr_01z;
                    gout13 += trr_10x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout14 += trr_10x * fac * trr_11z;
                    gout15 += 1 * trr_20y * trr_01z;
                    gout16 += 1 * trr_10y * trr_11z;
                    double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                    gout17 += 1 * fac * trr_21z;
                }
            }
        }
        if (active) {
            int *ao_loc = envs.ao_loc;
            // 64-bit so that i_dm*nao*nao and row*nao+col cannot overflow int.
            const size_t nao = (size_t)ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                // vk[i,l] += sum_k (ij|kl) * dm[j,k]
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout7 * dm[(j0+0)*nao+(k0+1)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout8 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout15 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout16 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout17 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                // vk[j,l] += sum_{i,k} (ij|kl) * dm[i,k]
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout6 * dm[(i0+0)*nao+(k0+1)];
                val += gout12 * dm[(i0+0)*nao+(k0+2)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout7 * dm[(i0+1)*nao+(k0+1)];
                val += gout13 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout8 * dm[(i0+2)*nao+(k0+1)];
                val += gout14 * dm[(i0+2)*nao+(k0+2)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout9 * dm[(i0+3)*nao+(k0+1)];
                val += gout15 * dm[(i0+3)*nao+(k0+2)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout10 * dm[(i0+4)*nao+(k0+1)];
                val += gout16 * dm[(i0+4)*nao+(k0+2)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout11 * dm[(i0+5)*nao+(k0+1)];
                val += gout17 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                // vk[i,k] += (ij|kl) * dm[j,l]
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                // vk[j,k] += sum_i (ij|kl) * dm[i,l]
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout9 * dm[(i0+3)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                val += gout17 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
            }
        }
    }
    // Claim the next ij pair for this block.
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_2011(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;
        double gout30;
        double gout31;
        double gout32;
        double gout33;
        double gout34;
        double gout35;
        double gout36;
        double gout37;
        double gout38;
        double gout39;
        double gout40;
        double gout41;
        double gout42;
        double gout43;
        double gout44;
        double gout45;
        double gout46;
        double gout47;
        double gout48;
        double gout49;
        double gout50;
        double gout51;
        double gout52;
        double gout53;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        gout30 = 0;
        gout31 = 0;
        gout32 = 0;
        gout33 = 0;
        gout34 = 0;
        gout35 = 0;
        gout36 = 0;
        gout37 = 0;
        gout38 = 0;
        gout39 = 0;
        gout40 = 0;
        gout41 = 0;
        gout42 = 0;
        gout43 = 0;
        gout44 = 0;
        gout45 = 0;
        gout46 = 0;
        gout47 = 0;
        gout48 = 0;
        gout49 = 0;
        gout50 = 0;
        gout51 = 0;
        gout52 = 0;
        gout53 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double b01 = .5/akl * (1 - rt_akl);
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
                    double hrr_2011x = trr_22x - xlxk * trr_21x;
                    gout0 += hrr_2011x * fac * wt;
                    double trr_01x = cpx * 1;
                    double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
                    double hrr_1011x = trr_12x - xlxk * trr_11x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_1011x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_1011x * fac * trr_10z;
                    double trr_02x = cpx * trr_01x + 1*b01 * 1;
                    double hrr_0011x = trr_02x - xlxk * trr_01x;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += hrr_0011x * trr_20y * wt;
                    gout4 += hrr_0011x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += hrr_0011x * fac * trr_20z;
                    double hrr_2001x = trr_21x - xlxk * trr_20x;
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout6 += hrr_2001x * trr_01y * wt;
                    double hrr_1001x = trr_11x - xlxk * trr_10x;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout7 += hrr_1001x * trr_11y * wt;
                    gout8 += hrr_1001x * trr_01y * trr_10z;
                    double hrr_0001x = trr_01x - xlxk * 1;
                    double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                    gout9 += hrr_0001x * trr_21y * wt;
                    gout10 += hrr_0001x * trr_11y * trr_10z;
                    gout11 += hrr_0001x * trr_01y * trr_20z;
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout12 += hrr_2001x * fac * trr_01z;
                    gout13 += hrr_1001x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout14 += hrr_1001x * fac * trr_11z;
                    gout15 += hrr_0001x * trr_20y * trr_01z;
                    gout16 += hrr_0001x * trr_10y * trr_11z;
                    double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                    gout17 += hrr_0001x * fac * trr_21z;
                    double hrr_0001y = trr_01y - ylyk * fac;
                    gout18 += trr_21x * hrr_0001y * wt;
                    double hrr_1001y = trr_11y - ylyk * trr_10y;
                    gout19 += trr_11x * hrr_1001y * wt;
                    gout20 += trr_11x * hrr_0001y * trr_10z;
                    double hrr_2001y = trr_21y - ylyk * trr_20y;
                    gout21 += trr_01x * hrr_2001y * wt;
                    gout22 += trr_01x * hrr_1001y * trr_10z;
                    gout23 += trr_01x * hrr_0001y * trr_20z;
                    double trr_02y = cpy * trr_01y + 1*b01 * fac;
                    double hrr_0011y = trr_02y - ylyk * trr_01y;
                    gout24 += trr_20x * hrr_0011y * wt;
                    double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
                    double hrr_1011y = trr_12y - ylyk * trr_11y;
                    gout25 += trr_10x * hrr_1011y * wt;
                    gout26 += trr_10x * hrr_0011y * trr_10z;
                    double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
                    double hrr_2011y = trr_22y - ylyk * trr_21y;
                    gout27 += 1 * hrr_2011y * wt;
                    gout28 += 1 * hrr_1011y * trr_10z;
                    gout29 += 1 * hrr_0011y * trr_20z;
                    gout30 += trr_20x * hrr_0001y * trr_01z;
                    gout31 += trr_10x * hrr_1001y * trr_01z;
                    gout32 += trr_10x * hrr_0001y * trr_11z;
                    gout33 += 1 * hrr_2001y * trr_01z;
                    gout34 += 1 * hrr_1001y * trr_11z;
                    gout35 += 1 * hrr_0001y * trr_21z;
                    double hrr_0001z = trr_01z - zlzk * wt;
                    gout36 += trr_21x * fac * hrr_0001z;
                    gout37 += trr_11x * trr_10y * hrr_0001z;
                    double hrr_1001z = trr_11z - zlzk * trr_10z;
                    gout38 += trr_11x * fac * hrr_1001z;
                    gout39 += trr_01x * trr_20y * hrr_0001z;
                    gout40 += trr_01x * trr_10y * hrr_1001z;
                    double hrr_2001z = trr_21z - zlzk * trr_20z;
                    gout41 += trr_01x * fac * hrr_2001z;
                    gout42 += trr_20x * trr_01y * hrr_0001z;
                    gout43 += trr_10x * trr_11y * hrr_0001z;
                    gout44 += trr_10x * trr_01y * hrr_1001z;
                    gout45 += 1 * trr_21y * hrr_0001z;
                    gout46 += 1 * trr_11y * hrr_1001z;
                    gout47 += 1 * trr_01y * hrr_2001z;
                    double trr_02z = cpz * trr_01z + 1*b01 * wt;
                    double hrr_0011z = trr_02z - zlzk * trr_01z;
                    gout48 += trr_20x * fac * hrr_0011z;
                    gout49 += trr_10x * trr_10y * hrr_0011z;
                    double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
                    double hrr_1011z = trr_12z - zlzk * trr_11z;
                    gout50 += trr_10x * fac * hrr_1011z;
                    gout51 += 1 * trr_20y * hrr_0011z;
                    gout52 += 1 * trr_10y * hrr_1011z;
                    double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
                    double hrr_2011z = trr_22z - zlzk * trr_21z;
                    gout53 += 1 * fac * hrr_2011z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+0)];
                val += gout24 * dm[(j0+0)*nao+(k0+1)];
                val += gout30 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout36 * dm[(j0+0)*nao+(k0+0)];
                val += gout42 * dm[(j0+0)*nao+(k0+1)];
                val += gout48 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout7 * dm[(j0+0)*nao+(k0+1)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+0)];
                val += gout25 * dm[(j0+0)*nao+(k0+1)];
                val += gout31 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout37 * dm[(j0+0)*nao+(k0+0)];
                val += gout43 * dm[(j0+0)*nao+(k0+1)];
                val += gout49 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout8 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(k0+0)];
                val += gout26 * dm[(j0+0)*nao+(k0+1)];
                val += gout32 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout38 * dm[(j0+0)*nao+(k0+0)];
                val += gout44 * dm[(j0+0)*nao+(k0+1)];
                val += gout50 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout15 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout21 * dm[(j0+0)*nao+(k0+0)];
                val += gout27 * dm[(j0+0)*nao+(k0+1)];
                val += gout33 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout39 * dm[(j0+0)*nao+(k0+0)];
                val += gout45 * dm[(j0+0)*nao+(k0+1)];
                val += gout51 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout16 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout22 * dm[(j0+0)*nao+(k0+0)];
                val += gout28 * dm[(j0+0)*nao+(k0+1)];
                val += gout34 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout40 * dm[(j0+0)*nao+(k0+0)];
                val += gout46 * dm[(j0+0)*nao+(k0+1)];
                val += gout52 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout17 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(k0+0)];
                val += gout29 * dm[(j0+0)*nao+(k0+1)];
                val += gout35 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout41 * dm[(j0+0)*nao+(k0+0)];
                val += gout47 * dm[(j0+0)*nao+(k0+1)];
                val += gout53 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout6 * dm[(i0+0)*nao+(k0+1)];
                val += gout12 * dm[(i0+0)*nao+(k0+2)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout7 * dm[(i0+1)*nao+(k0+1)];
                val += gout13 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout8 * dm[(i0+2)*nao+(k0+1)];
                val += gout14 * dm[(i0+2)*nao+(k0+2)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout9 * dm[(i0+3)*nao+(k0+1)];
                val += gout15 * dm[(i0+3)*nao+(k0+2)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout10 * dm[(i0+4)*nao+(k0+1)];
                val += gout16 * dm[(i0+4)*nao+(k0+2)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout11 * dm[(i0+5)*nao+(k0+1)];
                val += gout17 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(k0+0)];
                val += gout24 * dm[(i0+0)*nao+(k0+1)];
                val += gout30 * dm[(i0+0)*nao+(k0+2)];
                val += gout19 * dm[(i0+1)*nao+(k0+0)];
                val += gout25 * dm[(i0+1)*nao+(k0+1)];
                val += gout31 * dm[(i0+1)*nao+(k0+2)];
                val += gout20 * dm[(i0+2)*nao+(k0+0)];
                val += gout26 * dm[(i0+2)*nao+(k0+1)];
                val += gout32 * dm[(i0+2)*nao+(k0+2)];
                val += gout21 * dm[(i0+3)*nao+(k0+0)];
                val += gout27 * dm[(i0+3)*nao+(k0+1)];
                val += gout33 * dm[(i0+3)*nao+(k0+2)];
                val += gout22 * dm[(i0+4)*nao+(k0+0)];
                val += gout28 * dm[(i0+4)*nao+(k0+1)];
                val += gout34 * dm[(i0+4)*nao+(k0+2)];
                val += gout23 * dm[(i0+5)*nao+(k0+0)];
                val += gout29 * dm[(i0+5)*nao+(k0+1)];
                val += gout35 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout36 * dm[(i0+0)*nao+(k0+0)];
                val += gout42 * dm[(i0+0)*nao+(k0+1)];
                val += gout48 * dm[(i0+0)*nao+(k0+2)];
                val += gout37 * dm[(i0+1)*nao+(k0+0)];
                val += gout43 * dm[(i0+1)*nao+(k0+1)];
                val += gout49 * dm[(i0+1)*nao+(k0+2)];
                val += gout38 * dm[(i0+2)*nao+(k0+0)];
                val += gout44 * dm[(i0+2)*nao+(k0+1)];
                val += gout50 * dm[(i0+2)*nao+(k0+2)];
                val += gout39 * dm[(i0+3)*nao+(k0+0)];
                val += gout45 * dm[(i0+3)*nao+(k0+1)];
                val += gout51 * dm[(i0+3)*nao+(k0+2)];
                val += gout40 * dm[(i0+4)*nao+(k0+0)];
                val += gout46 * dm[(i0+4)*nao+(k0+1)];
                val += gout52 * dm[(i0+4)*nao+(k0+2)];
                val += gout41 * dm[(i0+5)*nao+(k0+0)];
                val += gout47 * dm[(i0+5)*nao+(k0+1)];
                val += gout53 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout18 * dm[(j0+0)*nao+(l0+1)];
                val += gout36 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout24 * dm[(j0+0)*nao+(l0+1)];
                val += gout42 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+0)];
                val += gout30 * dm[(j0+0)*nao+(l0+1)];
                val += gout48 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+0)*nao+(l0+1)];
                val += gout37 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout25 * dm[(j0+0)*nao+(l0+1)];
                val += gout43 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+0)];
                val += gout31 * dm[(j0+0)*nao+(l0+1)];
                val += gout49 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+0)*nao+(l0+1)];
                val += gout38 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout26 * dm[(j0+0)*nao+(l0+1)];
                val += gout44 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                val += gout32 * dm[(j0+0)*nao+(l0+1)];
                val += gout50 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+0)*nao+(l0+1)];
                val += gout39 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout27 * dm[(j0+0)*nao+(l0+1)];
                val += gout45 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                val += gout33 * dm[(j0+0)*nao+(l0+1)];
                val += gout51 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+0)*nao+(l0+1)];
                val += gout40 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                val += gout28 * dm[(j0+0)*nao+(l0+1)];
                val += gout46 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                val += gout34 * dm[(j0+0)*nao+(l0+1)];
                val += gout52 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout23 * dm[(j0+0)*nao+(l0+1)];
                val += gout41 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+0)];
                val += gout29 * dm[(j0+0)*nao+(l0+1)];
                val += gout47 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(l0+0)];
                val += gout35 * dm[(j0+0)*nao+(l0+1)];
                val += gout53 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout18 * dm[(i0+0)*nao+(l0+1)];
                val += gout36 * dm[(i0+0)*nao+(l0+2)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout19 * dm[(i0+1)*nao+(l0+1)];
                val += gout37 * dm[(i0+1)*nao+(l0+2)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout20 * dm[(i0+2)*nao+(l0+1)];
                val += gout38 * dm[(i0+2)*nao+(l0+2)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout21 * dm[(i0+3)*nao+(l0+1)];
                val += gout39 * dm[(i0+3)*nao+(l0+2)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout22 * dm[(i0+4)*nao+(l0+1)];
                val += gout40 * dm[(i0+4)*nao+(l0+2)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                val += gout23 * dm[(i0+5)*nao+(l0+1)];
                val += gout41 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout24 * dm[(i0+0)*nao+(l0+1)];
                val += gout42 * dm[(i0+0)*nao+(l0+2)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout25 * dm[(i0+1)*nao+(l0+1)];
                val += gout43 * dm[(i0+1)*nao+(l0+2)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout26 * dm[(i0+2)*nao+(l0+1)];
                val += gout44 * dm[(i0+2)*nao+(l0+2)];
                val += gout9 * dm[(i0+3)*nao+(l0+0)];
                val += gout27 * dm[(i0+3)*nao+(l0+1)];
                val += gout45 * dm[(i0+3)*nao+(l0+2)];
                val += gout10 * dm[(i0+4)*nao+(l0+0)];
                val += gout28 * dm[(i0+4)*nao+(l0+1)];
                val += gout46 * dm[(i0+4)*nao+(l0+2)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                val += gout29 * dm[(i0+5)*nao+(l0+1)];
                val += gout47 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout30 * dm[(i0+0)*nao+(l0+1)];
                val += gout48 * dm[(i0+0)*nao+(l0+2)];
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout31 * dm[(i0+1)*nao+(l0+1)];
                val += gout49 * dm[(i0+1)*nao+(l0+2)];
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                val += gout32 * dm[(i0+2)*nao+(l0+1)];
                val += gout50 * dm[(i0+2)*nao+(l0+2)];
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                val += gout33 * dm[(i0+3)*nao+(l0+1)];
                val += gout51 * dm[(i0+3)*nao+(l0+2)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                val += gout34 * dm[(i0+4)*nao+(l0+1)];
                val += gout52 * dm[(i0+4)*nao+(l0+2)];
                val += gout17 * dm[(i0+5)*nao+(l0+0)];
                val += gout35 * dm[(i0+5)*nao+(l0+1)];
                val += gout53 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_2020(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;
        double gout30;
        double gout31;
        double gout32;
        double gout33;
        double gout34;
        double gout35;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        gout30 = 0;
        gout31 = 0;
        gout32 = 0;
        gout33 = 0;
        gout34 = 0;
        gout35 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double b01 = .5/akl * (1 - rt_akl);
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
                    gout0 += trr_22x * fac * wt;
                    double trr_01x = cpx * 1;
                    double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += trr_12x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += trr_12x * fac * trr_10z;
                    double trr_02x = cpx * trr_01x + 1*b01 * 1;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += trr_02x * trr_20y * wt;
                    gout4 += trr_02x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += trr_02x * fac * trr_20z;
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout6 += trr_21x * trr_01y * wt;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout7 += trr_11x * trr_11y * wt;
                    gout8 += trr_11x * trr_01y * trr_10z;
                    double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                    gout9 += trr_01x * trr_21y * wt;
                    gout10 += trr_01x * trr_11y * trr_10z;
                    gout11 += trr_01x * trr_01y * trr_20z;
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout12 += trr_21x * fac * trr_01z;
                    gout13 += trr_11x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout14 += trr_11x * fac * trr_11z;
                    gout15 += trr_01x * trr_20y * trr_01z;
                    gout16 += trr_01x * trr_10y * trr_11z;
                    double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                    gout17 += trr_01x * fac * trr_21z;
                    double trr_02y = cpy * trr_01y + 1*b01 * fac;
                    gout18 += trr_20x * trr_02y * wt;
                    double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
                    gout19 += trr_10x * trr_12y * wt;
                    gout20 += trr_10x * trr_02y * trr_10z;
                    double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
                    gout21 += 1 * trr_22y * wt;
                    gout22 += 1 * trr_12y * trr_10z;
                    gout23 += 1 * trr_02y * trr_20z;
                    gout24 += trr_20x * trr_01y * trr_01z;
                    gout25 += trr_10x * trr_11y * trr_01z;
                    gout26 += trr_10x * trr_01y * trr_11z;
                    gout27 += 1 * trr_21y * trr_01z;
                    gout28 += 1 * trr_11y * trr_11z;
                    gout29 += 1 * trr_01y * trr_21z;
                    double trr_02z = cpz * trr_01z + 1*b01 * wt;
                    gout30 += trr_20x * fac * trr_02z;
                    gout31 += trr_10x * trr_10y * trr_02z;
                    double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
                    gout32 += trr_10x * fac * trr_12z;
                    gout33 += 1 * trr_20y * trr_02z;
                    gout34 += 1 * trr_10y * trr_12z;
                    double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
                    gout35 += 1 * fac * trr_22z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                val += gout18 * dm[(j0+0)*nao+(k0+3)];
                val += gout24 * dm[(j0+0)*nao+(k0+4)];
                val += gout30 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout7 * dm[(j0+0)*nao+(k0+1)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                val += gout19 * dm[(j0+0)*nao+(k0+3)];
                val += gout25 * dm[(j0+0)*nao+(k0+4)];
                val += gout31 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout8 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+2)];
                val += gout20 * dm[(j0+0)*nao+(k0+3)];
                val += gout26 * dm[(j0+0)*nao+(k0+4)];
                val += gout32 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout15 * dm[(j0+0)*nao+(k0+2)];
                val += gout21 * dm[(j0+0)*nao+(k0+3)];
                val += gout27 * dm[(j0+0)*nao+(k0+4)];
                val += gout33 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout16 * dm[(j0+0)*nao+(k0+2)];
                val += gout22 * dm[(j0+0)*nao+(k0+3)];
                val += gout28 * dm[(j0+0)*nao+(k0+4)];
                val += gout34 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout17 * dm[(j0+0)*nao+(k0+2)];
                val += gout23 * dm[(j0+0)*nao+(k0+3)];
                val += gout29 * dm[(j0+0)*nao+(k0+4)];
                val += gout35 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout6 * dm[(i0+0)*nao+(k0+1)];
                val += gout12 * dm[(i0+0)*nao+(k0+2)];
                val += gout18 * dm[(i0+0)*nao+(k0+3)];
                val += gout24 * dm[(i0+0)*nao+(k0+4)];
                val += gout30 * dm[(i0+0)*nao+(k0+5)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout7 * dm[(i0+1)*nao+(k0+1)];
                val += gout13 * dm[(i0+1)*nao+(k0+2)];
                val += gout19 * dm[(i0+1)*nao+(k0+3)];
                val += gout25 * dm[(i0+1)*nao+(k0+4)];
                val += gout31 * dm[(i0+1)*nao+(k0+5)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout8 * dm[(i0+2)*nao+(k0+1)];
                val += gout14 * dm[(i0+2)*nao+(k0+2)];
                val += gout20 * dm[(i0+2)*nao+(k0+3)];
                val += gout26 * dm[(i0+2)*nao+(k0+4)];
                val += gout32 * dm[(i0+2)*nao+(k0+5)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout9 * dm[(i0+3)*nao+(k0+1)];
                val += gout15 * dm[(i0+3)*nao+(k0+2)];
                val += gout21 * dm[(i0+3)*nao+(k0+3)];
                val += gout27 * dm[(i0+3)*nao+(k0+4)];
                val += gout33 * dm[(i0+3)*nao+(k0+5)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout10 * dm[(i0+4)*nao+(k0+1)];
                val += gout16 * dm[(i0+4)*nao+(k0+2)];
                val += gout22 * dm[(i0+4)*nao+(k0+3)];
                val += gout28 * dm[(i0+4)*nao+(k0+4)];
                val += gout34 * dm[(i0+4)*nao+(k0+5)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout11 * dm[(i0+5)*nao+(k0+1)];
                val += gout17 * dm[(i0+5)*nao+(k0+2)];
                val += gout23 * dm[(i0+5)*nao+(k0+3)];
                val += gout29 * dm[(i0+5)*nao+(k0+4)];
                val += gout35 * dm[(i0+5)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+3), val);
                val = 0;
                val += gout24 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+4), val);
                val = 0;
                val += gout30 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+5), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+3), val);
                val = 0;
                val += gout25 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+4), val);
                val = 0;
                val += gout31 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+5), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+3), val);
                val = 0;
                val += gout26 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+4), val);
                val = 0;
                val += gout32 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+5), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout21 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+3), val);
                val = 0;
                val += gout27 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+4), val);
                val = 0;
                val += gout33 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+5), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout22 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+3), val);
                val = 0;
                val += gout28 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+4), val);
                val = 0;
                val += gout34 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+5), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+3), val);
                val = 0;
                val += gout29 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+4), val);
                val = 0;
                val += gout35 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout9 * dm[(i0+3)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                val += gout17 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+0)];
                val += gout19 * dm[(i0+1)*nao+(l0+0)];
                val += gout20 * dm[(i0+2)*nao+(l0+0)];
                val += gout21 * dm[(i0+3)*nao+(l0+0)];
                val += gout22 * dm[(i0+4)*nao+(l0+0)];
                val += gout23 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout24 * dm[(i0+0)*nao+(l0+0)];
                val += gout25 * dm[(i0+1)*nao+(l0+0)];
                val += gout26 * dm[(i0+2)*nao+(l0+0)];
                val += gout27 * dm[(i0+3)*nao+(l0+0)];
                val += gout28 * dm[(i0+4)*nao+(l0+0)];
                val += gout29 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout30 * dm[(i0+0)*nao+(l0+0)];
                val += gout31 * dm[(i0+1)*nao+(l0+0)];
                val += gout32 * dm[(i0+2)*nao+(l0+0)];
                val += gout33 * dm[(i0+3)*nao+(l0+0)];
                val += gout34 * dm[(i0+4)*nao+(l0+0)];
                val += gout35 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_2021(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int gout_id = threadIdx.y;
    int thread_id = 64 * gout_id + sq_id;
    int threads = 256;
    constexpr int nsq_per_block = 64;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (thread_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (thread_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    constexpr int g_size = 18;

    extern __shared__ double shared_memory[];
    double *rlrk = shared_memory + sq_id;
    double *Rpq = shared_memory + nsq_per_block * 3 + sq_id;
    double *akl_cache = shared_memory + nsq_per_block * 6 + sq_id;
    double *fac_ijkl = shared_memory + nsq_per_block * 8 + sq_id;
    double *gx = shared_memory + nsq_per_block * 9 + sq_id;
    double *rw = shared_memory + nsq_per_block * (g_size*3+9) + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * (g_size*3+bounds.nroots*2+9);

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double aij_cache[2];
    __shared__ double *expi;
    __shared__ double *expj;
    if (thread_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (thread_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[thread_id] = env[ri_ptr+thread_id];
        rjri[thread_id] = env[rj_ptr+thread_id] - ri[thread_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = thread_id; ij < iprim*jprim; ij += threads) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }

    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        if (gout_id == 0) {
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            rlrk[0] = xlxk;
            rlrk[64] = ylyk;
            rlrk[128] = zlzk;
            fac_ijkl[0] = fac_sym;
        }
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            __syncthreads();
            if (gout_id == 0) {
                double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
                double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
                double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
                double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double xlxk = rlrk[0];
                double ylyk = rlrk[64];
                double zlzk = rlrk[128];
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = ck[kp] * cl[lp] * Kcd;
                double fac_sym = fac_ijkl[0];
                gx[0] = fac_sym * ckcl;
                akl_cache[0] = akl;
                akl_cache[nsq_per_block] = al_akl;
            }
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double akl = akl_cache[0];
                double al_akl = akl_cache[nsq_per_block];
                double xij = ri[0] + (rjri[0]) * aj_aij;
                double yij = ri[1] + (rjri[1]) * aj_aij;
                double zij = ri[2] + (rjri[2]) * aj_aij;
                double xkl = rk[0] + rlrk[0*nsq_per_block] * al_akl;
                double ykl = rk[1] + rlrk[1*nsq_per_block] * al_akl;
                double zkl = rk[2] + rlrk[2*nsq_per_block] * al_akl;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                if (gout_id == 0) {
                    Rpq[0*nsq_per_block] = xpq;
                    Rpq[1*nsq_per_block] = ypq;
                    Rpq[2*nsq_per_block] = zpq;
                    double cicj = cicj_cache[ijp];
                    gx[nsq_per_block*g_size] = cicj / (aij*akl*sqrt(aij+akl));
                    if (sq_id == 0) {
                        aij_cache[0] = aij;
                        aij_cache[1] = aj_aij;
                    }
                }
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                for (int irys = 0; irys < nroots; ++irys) {
                    __syncthreads();
                    double s0, s1, s2;
                    double rt = rw[irys*128];
                    double aij = aij_cache[0];
                    double rt_aa = rt / (aij + akl);
                    double akl = akl_cache[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double rt_akl = rt_aa * aij;
                    double b00 = .5 * rt_aa;
                    double b01 = .5/akl * (1 - rt_akl);
                    for (int n = gout_id; n < 3; n += 4) {
                        if (n == 2) {
                            gx[2304] = rw[irys*128+64];
                        }
                        double *_gx = gx + n * 1152;
                        double xjxi = rjri[n];
                        double Rpa = xjxi * aij_cache[1];
                        double c0x = Rpa - rt_aij * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = c0x * s0;
                        _gx[64] = s1;
                        s2 = c0x * s1 + 1 * b10 * s0;
                        _gx[128] = s2;
                        double xlxk = rlrk[n*64];
                        double Rqc = xlxk * akl_cache[64];
                        double cpx = Rqc + rt_akl * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = cpx * s0;
                        _gx[192] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        _gx[384] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = cpx*s1 + 2 * b01 *s0;
                        _gx[576] = s2;
                        s0 = _gx[64];
                        s1 = cpx * s0;
                        s1 += 1 * b00 * _gx[0];
                        _gx[256] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 1 * b00 * _gx[192];
                        _gx[448] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = cpx*s1 + 2 * b01 *s0;
                        s2 += 1 * b00 * _gx[384];
                        _gx[640] = s2;
                        s0 = _gx[128];
                        s1 = cpx * s0;
                        s1 += 2 * b00 * _gx[64];
                        _gx[320] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 2 * b00 * _gx[256];
                        _gx[512] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = cpx*s1 + 2 * b01 *s0;
                        s2 += 2 * b00 * _gx[448];
                        _gx[704] = s2;
                        s1 = _gx[576];
                        s0 = _gx[384];
                        _gx[960] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[192];
                        _gx[768] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[0];
                        _gx[576] = s1 - xlxk * s0;
                        s1 = _gx[640];
                        s0 = _gx[448];
                        _gx[1024] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[256];
                        _gx[832] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[64];
                        _gx[640] = s1 - xlxk * s0;
                        s1 = _gx[704];
                        s0 = _gx[512];
                        _gx[1088] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[320];
                        _gx[896] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[128];
                        _gx[704] = s1 - xlxk * s0;
                    }
                    __syncthreads();
                    switch (gout_id) {
                    case 0:
                    gout0 += gx[1088] * gx[1152] * gx[2304];
                    gout1 += gx[960] * gx[1216] * gx[2368];
                    gout2 += gx[832] * gx[1344] * gx[2368];
                    gout3 += gx[896] * gx[1152] * gx[2496];
                    gout4 += gx[768] * gx[1216] * gx[2560];
                    gout5 += gx[640] * gx[1536] * gx[2368];
                    gout6 += gx[704] * gx[1344] * gx[2496];
                    gout7 += gx[576] * gx[1408] * gx[2560];
                    gout8 += gx[640] * gx[1152] * gx[2752];
                    gout9 += gx[512] * gx[1728] * gx[2304];
                    gout10 += gx[384] * gx[1792] * gx[2368];
                    gout11 += gx[256] * gx[1920] * gx[2368];
                    gout12 += gx[320] * gx[1728] * gx[2496];
                    gout13 += gx[192] * gx[1792] * gx[2560];
                    gout14 += gx[64] * gx[2112] * gx[2368];
                    gout15 += gx[128] * gx[1920] * gx[2496];
                    gout16 += gx[0] * gx[1984] * gx[2560];
                    gout17 += gx[64] * gx[1728] * gx[2752];
                    gout18 += gx[512] * gx[1152] * gx[2880];
                    gout19 += gx[384] * gx[1216] * gx[2944];
                    gout20 += gx[256] * gx[1344] * gx[2944];
                    gout21 += gx[320] * gx[1152] * gx[3072];
                    gout22 += gx[192] * gx[1216] * gx[3136];
                    gout23 += gx[64] * gx[1536] * gx[2944];
                    gout24 += gx[128] * gx[1344] * gx[3072];
                    gout25 += gx[0] * gx[1408] * gx[3136];
                    gout26 += gx[64] * gx[1152] * gx[3328];
                    break;
                    case 1:
                    gout0 += gx[1024] * gx[1216] * gx[2304];
                    gout1 += gx[960] * gx[1152] * gx[2432];
                    gout2 += gx[768] * gx[1472] * gx[2304];
                    gout3 += gx[832] * gx[1216] * gx[2496];
                    gout4 += gx[768] * gx[1152] * gx[2624];
                    gout5 += gx[576] * gx[1664] * gx[2304];
                    gout6 += gx[640] * gx[1408] * gx[2496];
                    gout7 += gx[576] * gx[1344] * gx[2624];
                    gout8 += gx[576] * gx[1280] * gx[2688];
                    gout9 += gx[448] * gx[1792] * gx[2304];
                    gout10 += gx[384] * gx[1728] * gx[2432];
                    gout11 += gx[192] * gx[2048] * gx[2304];
                    gout12 += gx[256] * gx[1792] * gx[2496];
                    gout13 += gx[192] * gx[1728] * gx[2624];
                    gout14 += gx[0] * gx[2240] * gx[2304];
                    gout15 += gx[64] * gx[1984] * gx[2496];
                    gout16 += gx[0] * gx[1920] * gx[2624];
                    gout17 += gx[0] * gx[1856] * gx[2688];
                    gout18 += gx[448] * gx[1216] * gx[2880];
                    gout19 += gx[384] * gx[1152] * gx[3008];
                    gout20 += gx[192] * gx[1472] * gx[2880];
                    gout21 += gx[256] * gx[1216] * gx[3072];
                    gout22 += gx[192] * gx[1152] * gx[3200];
                    gout23 += gx[0] * gx[1664] * gx[2880];
                    gout24 += gx[64] * gx[1408] * gx[3072];
                    gout25 += gx[0] * gx[1344] * gx[3200];
                    gout26 += gx[0] * gx[1280] * gx[3264];
                    break;
                    case 2:
                    gout0 += gx[1024] * gx[1152] * gx[2368];
                    gout1 += gx[896] * gx[1344] * gx[2304];
                    gout2 += gx[768] * gx[1408] * gx[2368];
                    gout3 += gx[832] * gx[1152] * gx[2560];
                    gout4 += gx[704] * gx[1536] * gx[2304];
                    gout5 += gx[576] * gx[1600] * gx[2368];
                    gout6 += gx[640] * gx[1344] * gx[2560];
                    gout7 += gx[704] * gx[1152] * gx[2688];
                    gout8 += gx[576] * gx[1216] * gx[2752];
                    gout9 += gx[448] * gx[1728] * gx[2368];
                    gout10 += gx[320] * gx[1920] * gx[2304];
                    gout11 += gx[192] * gx[1984] * gx[2368];
                    gout12 += gx[256] * gx[1728] * gx[2560];
                    gout13 += gx[128] * gx[2112] * gx[2304];
                    gout14 += gx[0] * gx[2176] * gx[2368];
                    gout15 += gx[64] * gx[1920] * gx[2560];
                    gout16 += gx[128] * gx[1728] * gx[2688];
                    gout17 += gx[0] * gx[1792] * gx[2752];
                    gout18 += gx[448] * gx[1152] * gx[2944];
                    gout19 += gx[320] * gx[1344] * gx[2880];
                    gout20 += gx[192] * gx[1408] * gx[2944];
                    gout21 += gx[256] * gx[1152] * gx[3136];
                    gout22 += gx[128] * gx[1536] * gx[2880];
                    gout23 += gx[0] * gx[1600] * gx[2944];
                    gout24 += gx[64] * gx[1344] * gx[3136];
                    gout25 += gx[128] * gx[1152] * gx[3264];
                    gout26 += gx[0] * gx[1216] * gx[3328];
                    break;
                    case 3:
                    gout0 += gx[960] * gx[1280] * gx[2304];
                    gout1 += gx[832] * gx[1408] * gx[2304];
                    gout2 += gx[768] * gx[1344] * gx[2432];
                    gout3 += gx[768] * gx[1280] * gx[2496];
                    gout4 += gx[640] * gx[1600] * gx[2304];
                    gout5 += gx[576] * gx[1536] * gx[2432];
                    gout6 += gx[576] * gx[1472] * gx[2496];
                    gout7 += gx[640] * gx[1216] * gx[2688];
                    gout8 += gx[576] * gx[1152] * gx[2816];
                    gout9 += gx[384] * gx[1856] * gx[2304];
                    gout10 += gx[256] * gx[1984] * gx[2304];
                    gout11 += gx[192] * gx[1920] * gx[2432];
                    gout12 += gx[192] * gx[1856] * gx[2496];
                    gout13 += gx[64] * gx[2176] * gx[2304];
                    gout14 += gx[0] * gx[2112] * gx[2432];
                    gout15 += gx[0] * gx[2048] * gx[2496];
                    gout16 += gx[64] * gx[1792] * gx[2688];
                    gout17 += gx[0] * gx[1728] * gx[2816];
                    gout18 += gx[384] * gx[1280] * gx[2880];
                    gout19 += gx[256] * gx[1408] * gx[2880];
                    gout20 += gx[192] * gx[1344] * gx[3008];
                    gout21 += gx[192] * gx[1280] * gx[3072];
                    gout22 += gx[64] * gx[1600] * gx[2880];
                    gout23 += gx[0] * gx[1536] * gx[3008];
                    gout24 += gx[0] * gx[1472] * gx[3072];
                    gout25 += gx[64] * gx[1216] * gx[3264];
                    gout26 += gx[0] * gx[1152] * gx[3392];
                    break;
                    }
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                switch (gout_id) {
                case 0:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+0)*nao+(k0+2)];
                val += gout6 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                val += gout24 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+0)*nao+(k0+3)];
                val += gout8 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+3)];
                val += gout17 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(k0+1)];
                val += gout23 * dm[(j0+0)*nao+(k0+3)];
                val += gout26 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout4 * dm[(j0+0)*nao+(k0+2)];
                val += gout7 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                val += gout16 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+0)];
                val += gout22 * dm[(j0+0)*nao+(k0+2)];
                val += gout25 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout3 * dm[(i0+0)*nao+(k0+2)];
                val += gout6 * dm[(i0+0)*nao+(k0+4)];
                val += gout2 * dm[(i0+2)*nao+(k0+1)];
                val += gout5 * dm[(i0+2)*nao+(k0+3)];
                val += gout8 * dm[(i0+2)*nao+(k0+5)];
                val += gout1 * dm[(i0+4)*nao+(k0+0)];
                val += gout4 * dm[(i0+4)*nao+(k0+2)];
                val += gout7 * dm[(i0+4)*nao+(k0+4)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(k0+0)];
                val += gout12 * dm[(i0+0)*nao+(k0+2)];
                val += gout15 * dm[(i0+0)*nao+(k0+4)];
                val += gout11 * dm[(i0+2)*nao+(k0+1)];
                val += gout14 * dm[(i0+2)*nao+(k0+3)];
                val += gout17 * dm[(i0+2)*nao+(k0+5)];
                val += gout10 * dm[(i0+4)*nao+(k0+0)];
                val += gout13 * dm[(i0+4)*nao+(k0+2)];
                val += gout16 * dm[(i0+4)*nao+(k0+4)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(k0+0)];
                val += gout21 * dm[(i0+0)*nao+(k0+2)];
                val += gout24 * dm[(i0+0)*nao+(k0+4)];
                val += gout20 * dm[(i0+2)*nao+(k0+1)];
                val += gout23 * dm[(i0+2)*nao+(k0+3)];
                val += gout26 * dm[(i0+2)*nao+(k0+5)];
                val += gout19 * dm[(i0+4)*nao+(k0+0)];
                val += gout22 * dm[(i0+4)*nao+(k0+2)];
                val += gout25 * dm[(i0+4)*nao+(k0+4)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+1)];
                val += gout24 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+4), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                val += gout23 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+3), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+1)];
                val += gout26 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+5), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                val += gout22 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+1)];
                val += gout25 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+4), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout9 * dm[(i0+0)*nao+(l0+1)];
                val += gout18 * dm[(i0+0)*nao+(l0+2)];
                val += gout1 * dm[(i0+4)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+1)];
                val += gout19 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout11 * dm[(i0+2)*nao+(l0+1)];
                val += gout20 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(l0+0)];
                val += gout12 * dm[(i0+0)*nao+(l0+1)];
                val += gout21 * dm[(i0+0)*nao+(l0+2)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout13 * dm[(i0+4)*nao+(l0+1)];
                val += gout22 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                val += gout14 * dm[(i0+2)*nao+(l0+1)];
                val += gout23 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout15 * dm[(i0+0)*nao+(l0+1)];
                val += gout24 * dm[(i0+0)*nao+(l0+2)];
                val += gout7 * dm[(i0+4)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+1)];
                val += gout25 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout17 * dm[(i0+2)*nao+(l0+1)];
                val += gout26 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
                break;
                case 1:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+0)*nao+(k0+2)];
                val += gout6 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                val += gout24 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+0)*nao+(k0+3)];
                val += gout8 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+3)];
                val += gout17 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(k0+1)];
                val += gout23 * dm[(j0+0)*nao+(k0+3)];
                val += gout26 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout4 * dm[(j0+0)*nao+(k0+2)];
                val += gout7 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                val += gout16 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+0)];
                val += gout22 * dm[(j0+0)*nao+(k0+2)];
                val += gout25 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(k0+0)];
                val += gout3 * dm[(i0+1)*nao+(k0+2)];
                val += gout6 * dm[(i0+1)*nao+(k0+4)];
                val += gout2 * dm[(i0+3)*nao+(k0+1)];
                val += gout5 * dm[(i0+3)*nao+(k0+3)];
                val += gout8 * dm[(i0+3)*nao+(k0+5)];
                val += gout1 * dm[(i0+5)*nao+(k0+0)];
                val += gout4 * dm[(i0+5)*nao+(k0+2)];
                val += gout7 * dm[(i0+5)*nao+(k0+4)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(i0+1)*nao+(k0+0)];
                val += gout12 * dm[(i0+1)*nao+(k0+2)];
                val += gout15 * dm[(i0+1)*nao+(k0+4)];
                val += gout11 * dm[(i0+3)*nao+(k0+1)];
                val += gout14 * dm[(i0+3)*nao+(k0+3)];
                val += gout17 * dm[(i0+3)*nao+(k0+5)];
                val += gout10 * dm[(i0+5)*nao+(k0+0)];
                val += gout13 * dm[(i0+5)*nao+(k0+2)];
                val += gout16 * dm[(i0+5)*nao+(k0+4)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+1)*nao+(k0+0)];
                val += gout21 * dm[(i0+1)*nao+(k0+2)];
                val += gout24 * dm[(i0+1)*nao+(k0+4)];
                val += gout20 * dm[(i0+3)*nao+(k0+1)];
                val += gout23 * dm[(i0+3)*nao+(k0+3)];
                val += gout26 * dm[(i0+3)*nao+(k0+5)];
                val += gout19 * dm[(i0+5)*nao+(k0+0)];
                val += gout22 * dm[(i0+5)*nao+(k0+2)];
                val += gout25 * dm[(i0+5)*nao+(k0+4)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+1)];
                val += gout24 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+4), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                val += gout23 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+3), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+1)];
                val += gout26 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+5), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                val += gout22 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+1)];
                val += gout25 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+4), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(l0+0)];
                val += gout9 * dm[(i0+1)*nao+(l0+1)];
                val += gout18 * dm[(i0+1)*nao+(l0+2)];
                val += gout1 * dm[(i0+5)*nao+(l0+0)];
                val += gout10 * dm[(i0+5)*nao+(l0+1)];
                val += gout19 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(i0+3)*nao+(l0+0)];
                val += gout11 * dm[(i0+3)*nao+(l0+1)];
                val += gout20 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(l0+0)];
                val += gout12 * dm[(i0+1)*nao+(l0+1)];
                val += gout21 * dm[(i0+1)*nao+(l0+2)];
                val += gout4 * dm[(i0+5)*nao+(l0+0)];
                val += gout13 * dm[(i0+5)*nao+(l0+1)];
                val += gout22 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(i0+3)*nao+(l0+0)];
                val += gout14 * dm[(i0+3)*nao+(l0+1)];
                val += gout23 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(l0+0)];
                val += gout15 * dm[(i0+1)*nao+(l0+1)];
                val += gout24 * dm[(i0+1)*nao+(l0+2)];
                val += gout7 * dm[(i0+5)*nao+(l0+0)];
                val += gout16 * dm[(i0+5)*nao+(l0+1)];
                val += gout25 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout8 * dm[(i0+3)*nao+(l0+0)];
                val += gout17 * dm[(i0+3)*nao+(l0+1)];
                val += gout26 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
                break;
                case 2:
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+1)];
                val += gout4 * dm[(j0+0)*nao+(k0+3)];
                val += gout7 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout13 * dm[(j0+0)*nao+(k0+3)];
                val += gout16 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                val += gout22 * dm[(j0+0)*nao+(k0+3)];
                val += gout25 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+0)*nao+(k0+2)];
                val += gout6 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                val += gout24 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+0)*nao+(k0+3)];
                val += gout8 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+3)];
                val += gout17 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(k0+1)];
                val += gout23 * dm[(j0+0)*nao+(k0+3)];
                val += gout26 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(i0+0)*nao+(k0+1)];
                val += gout4 * dm[(i0+0)*nao+(k0+3)];
                val += gout7 * dm[(i0+0)*nao+(k0+5)];
                val += gout0 * dm[(i0+2)*nao+(k0+0)];
                val += gout3 * dm[(i0+2)*nao+(k0+2)];
                val += gout6 * dm[(i0+2)*nao+(k0+4)];
                val += gout2 * dm[(i0+4)*nao+(k0+1)];
                val += gout5 * dm[(i0+4)*nao+(k0+3)];
                val += gout8 * dm[(i0+4)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(k0+1)];
                val += gout13 * dm[(i0+0)*nao+(k0+3)];
                val += gout16 * dm[(i0+0)*nao+(k0+5)];
                val += gout9 * dm[(i0+2)*nao+(k0+0)];
                val += gout12 * dm[(i0+2)*nao+(k0+2)];
                val += gout15 * dm[(i0+2)*nao+(k0+4)];
                val += gout11 * dm[(i0+4)*nao+(k0+1)];
                val += gout14 * dm[(i0+4)*nao+(k0+3)];
                val += gout17 * dm[(i0+4)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(i0+0)*nao+(k0+1)];
                val += gout22 * dm[(i0+0)*nao+(k0+3)];
                val += gout25 * dm[(i0+0)*nao+(k0+5)];
                val += gout18 * dm[(i0+2)*nao+(k0+0)];
                val += gout21 * dm[(i0+2)*nao+(k0+2)];
                val += gout24 * dm[(i0+2)*nao+(k0+4)];
                val += gout20 * dm[(i0+4)*nao+(k0+1)];
                val += gout23 * dm[(i0+4)*nao+(k0+3)];
                val += gout26 * dm[(i0+4)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                val += gout22 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+3), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+1)];
                val += gout25 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+1)];
                val += gout24 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+4), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                val += gout23 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+3), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+1)];
                val += gout26 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(i0+2)*nao+(l0+0)];
                val += gout9 * dm[(i0+2)*nao+(l0+1)];
                val += gout18 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout1 * dm[(i0+0)*nao+(l0+0)];
                val += gout10 * dm[(i0+0)*nao+(l0+1)];
                val += gout19 * dm[(i0+0)*nao+(l0+2)];
                val += gout2 * dm[(i0+4)*nao+(l0+0)];
                val += gout11 * dm[(i0+4)*nao+(l0+1)];
                val += gout20 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout3 * dm[(i0+2)*nao+(l0+0)];
                val += gout12 * dm[(i0+2)*nao+(l0+1)];
                val += gout21 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+0)*nao+(l0+1)];
                val += gout22 * dm[(i0+0)*nao+(l0+2)];
                val += gout5 * dm[(i0+4)*nao+(l0+0)];
                val += gout14 * dm[(i0+4)*nao+(l0+1)];
                val += gout23 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout6 * dm[(i0+2)*nao+(l0+0)];
                val += gout15 * dm[(i0+2)*nao+(l0+1)];
                val += gout24 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(l0+0)];
                val += gout16 * dm[(i0+0)*nao+(l0+1)];
                val += gout25 * dm[(i0+0)*nao+(l0+2)];
                val += gout8 * dm[(i0+4)*nao+(l0+0)];
                val += gout17 * dm[(i0+4)*nao+(l0+1)];
                val += gout26 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
                break;
                case 3:
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+1)];
                val += gout4 * dm[(j0+0)*nao+(k0+3)];
                val += gout7 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout13 * dm[(j0+0)*nao+(k0+3)];
                val += gout16 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                val += gout22 * dm[(j0+0)*nao+(k0+3)];
                val += gout25 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+0)*nao+(k0+2)];
                val += gout6 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                val += gout24 * dm[(j0+0)*nao+(k0+4)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+0)*nao+(k0+3)];
                val += gout8 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+3)];
                val += gout17 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(k0+1)];
                val += gout23 * dm[(j0+0)*nao+(k0+3)];
                val += gout26 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(i0+1)*nao+(k0+1)];
                val += gout4 * dm[(i0+1)*nao+(k0+3)];
                val += gout7 * dm[(i0+1)*nao+(k0+5)];
                val += gout0 * dm[(i0+3)*nao+(k0+0)];
                val += gout3 * dm[(i0+3)*nao+(k0+2)];
                val += gout6 * dm[(i0+3)*nao+(k0+4)];
                val += gout2 * dm[(i0+5)*nao+(k0+1)];
                val += gout5 * dm[(i0+5)*nao+(k0+3)];
                val += gout8 * dm[(i0+5)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(k0+1)];
                val += gout13 * dm[(i0+1)*nao+(k0+3)];
                val += gout16 * dm[(i0+1)*nao+(k0+5)];
                val += gout9 * dm[(i0+3)*nao+(k0+0)];
                val += gout12 * dm[(i0+3)*nao+(k0+2)];
                val += gout15 * dm[(i0+3)*nao+(k0+4)];
                val += gout11 * dm[(i0+5)*nao+(k0+1)];
                val += gout14 * dm[(i0+5)*nao+(k0+3)];
                val += gout17 * dm[(i0+5)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(i0+1)*nao+(k0+1)];
                val += gout22 * dm[(i0+1)*nao+(k0+3)];
                val += gout25 * dm[(i0+1)*nao+(k0+5)];
                val += gout18 * dm[(i0+3)*nao+(k0+0)];
                val += gout21 * dm[(i0+3)*nao+(k0+2)];
                val += gout24 * dm[(i0+3)*nao+(k0+4)];
                val += gout20 * dm[(i0+5)*nao+(k0+1)];
                val += gout23 * dm[(i0+5)*nao+(k0+3)];
                val += gout26 * dm[(i0+5)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                val += gout22 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+3), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+1)];
                val += gout25 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+1)];
                val += gout24 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+4), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                val += gout23 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+3), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+1)];
                val += gout26 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(i0+3)*nao+(l0+0)];
                val += gout9 * dm[(i0+3)*nao+(l0+1)];
                val += gout18 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout10 * dm[(i0+1)*nao+(l0+1)];
                val += gout19 * dm[(i0+1)*nao+(l0+2)];
                val += gout2 * dm[(i0+5)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+1)];
                val += gout20 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout12 * dm[(i0+3)*nao+(l0+1)];
                val += gout21 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(i0+1)*nao+(l0+0)];
                val += gout13 * dm[(i0+1)*nao+(l0+1)];
                val += gout22 * dm[(i0+1)*nao+(l0+2)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                val += gout14 * dm[(i0+5)*nao+(l0+1)];
                val += gout23 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout6 * dm[(i0+3)*nao+(l0+0)];
                val += gout15 * dm[(i0+3)*nao+(l0+1)];
                val += gout24 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout16 * dm[(i0+1)*nao+(l0+1)];
                val += gout25 * dm[(i0+1)*nao+(l0+2)];
                val += gout8 * dm[(i0+5)*nao+(l0+0)];
                val += gout17 * dm[(i0+5)*nao+(l0+1)];
                val += gout26 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
                break;
                }
            }
        }
    }
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

#if CUDA_VERSION >= 12040
__global__ __maxnreg__(128) static
#else
__global__ static
#endif
void rys_k_2100(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
                    double hrr_2100x = trr_30x - xjxi * trr_20x;
                    gout0 += hrr_2100x * fac * wt;
                    double hrr_1100x = trr_20x - xjxi * trr_10x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_1100x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_1100x * fac * trr_10z;
                    double hrr_0100x = trr_10x - xjxi * 1;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += hrr_0100x * trr_20y * wt;
                    gout4 += hrr_0100x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += hrr_0100x * fac * trr_20z;
                    double hrr_0100y = trr_10y - yjyi * fac;
                    gout6 += trr_20x * hrr_0100y * wt;
                    double hrr_1100y = trr_20y - yjyi * trr_10y;
                    gout7 += trr_10x * hrr_1100y * wt;
                    gout8 += trr_10x * hrr_0100y * trr_10z;
                    double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
                    double hrr_2100y = trr_30y - yjyi * trr_20y;
                    gout9 += 1 * hrr_2100y * wt;
                    gout10 += 1 * hrr_1100y * trr_10z;
                    gout11 += 1 * hrr_0100y * trr_20z;
                    double hrr_0100z = trr_10z - zjzi * wt;
                    gout12 += trr_20x * fac * hrr_0100z;
                    gout13 += trr_10x * trr_10y * hrr_0100z;
                    double hrr_1100z = trr_20z - zjzi * trr_10z;
                    gout14 += trr_10x * fac * hrr_1100z;
                    gout15 += 1 * trr_20y * hrr_0100z;
                    gout16 += 1 * trr_10y * hrr_1100z;
                    double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
                    double hrr_2100z = trr_30z - zjzi * trr_20z;
                    gout17 += 1 * fac * hrr_2100z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+1)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout7 * dm[(j0+1)*nao+(k0+0)];
                val += gout13 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout8 * dm[(j0+1)*nao+(k0+0)];
                val += gout14 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+1)*nao+(k0+0)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+0)];
                val += gout16 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+0)];
                val += gout7 * dm[(i0+1)*nao+(k0+0)];
                val += gout8 * dm[(i0+2)*nao+(k0+0)];
                val += gout9 * dm[(i0+3)*nao+(k0+0)];
                val += gout10 * dm[(i0+4)*nao+(k0+0)];
                val += gout11 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(k0+0)];
                val += gout13 * dm[(i0+1)*nao+(k0+0)];
                val += gout14 * dm[(i0+2)*nao+(k0+0)];
                val += gout15 * dm[(i0+3)*nao+(k0+0)];
                val += gout16 * dm[(i0+4)*nao+(k0+0)];
                val += gout17 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout6 * dm[(j0+1)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout7 * dm[(j0+1)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout8 * dm[(j0+1)*nao+(l0+0)];
                val += gout14 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout9 * dm[(j0+1)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                val += gout16 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout9 * dm[(i0+3)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                val += gout17 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_2110(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;
        double gout30;
        double gout31;
        double gout32;
        double gout33;
        double gout34;
        double gout35;
        double gout36;
        double gout37;
        double gout38;
        double gout39;
        double gout40;
        double gout41;
        double gout42;
        double gout43;
        double gout44;
        double gout45;
        double gout46;
        double gout47;
        double gout48;
        double gout49;
        double gout50;
        double gout51;
        double gout52;
        double gout53;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        gout30 = 0;
        gout31 = 0;
        gout32 = 0;
        gout33 = 0;
        gout34 = 0;
        gout35 = 0;
        gout36 = 0;
        gout37 = 0;
        gout38 = 0;
        gout39 = 0;
        gout40 = 0;
        gout41 = 0;
        gout42 = 0;
        gout43 = 0;
        gout44 = 0;
        gout45 = 0;
        gout46 = 0;
        gout47 = 0;
        gout48 = 0;
        gout49 = 0;
        gout50 = 0;
        gout51 = 0;
        gout52 = 0;
        gout53 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
                    double trr_31x = cpx * trr_30x + 3*b00 * trr_20x;
                    double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                    double hrr_2110x = trr_31x - xjxi * trr_21x;
                    gout0 += hrr_2110x * fac * wt;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double hrr_1110x = trr_21x - xjxi * trr_11x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_1110x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_1110x * fac * trr_10z;
                    double trr_01x = cpx * 1;
                    double hrr_0110x = trr_11x - xjxi * trr_01x;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += hrr_0110x * trr_20y * wt;
                    gout4 += hrr_0110x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += hrr_0110x * fac * trr_20z;
                    double hrr_0100y = trr_10y - yjyi * fac;
                    gout6 += trr_21x * hrr_0100y * wt;
                    double hrr_1100y = trr_20y - yjyi * trr_10y;
                    gout7 += trr_11x * hrr_1100y * wt;
                    gout8 += trr_11x * hrr_0100y * trr_10z;
                    double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
                    double hrr_2100y = trr_30y - yjyi * trr_20y;
                    gout9 += trr_01x * hrr_2100y * wt;
                    gout10 += trr_01x * hrr_1100y * trr_10z;
                    gout11 += trr_01x * hrr_0100y * trr_20z;
                    double hrr_0100z = trr_10z - zjzi * wt;
                    gout12 += trr_21x * fac * hrr_0100z;
                    gout13 += trr_11x * trr_10y * hrr_0100z;
                    double hrr_1100z = trr_20z - zjzi * trr_10z;
                    gout14 += trr_11x * fac * hrr_1100z;
                    gout15 += trr_01x * trr_20y * hrr_0100z;
                    gout16 += trr_01x * trr_10y * hrr_1100z;
                    double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
                    double hrr_2100z = trr_30z - zjzi * trr_20z;
                    gout17 += trr_01x * fac * hrr_2100z;
                    double hrr_2100x = trr_30x - xjxi * trr_20x;
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout18 += hrr_2100x * trr_01y * wt;
                    double hrr_1100x = trr_20x - xjxi * trr_10x;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout19 += hrr_1100x * trr_11y * wt;
                    gout20 += hrr_1100x * trr_01y * trr_10z;
                    double hrr_0100x = trr_10x - xjxi * 1;
                    double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                    gout21 += hrr_0100x * trr_21y * wt;
                    gout22 += hrr_0100x * trr_11y * trr_10z;
                    gout23 += hrr_0100x * trr_01y * trr_20z;
                    double hrr_0110y = trr_11y - yjyi * trr_01y;
                    gout24 += trr_20x * hrr_0110y * wt;
                    double hrr_1110y = trr_21y - yjyi * trr_11y;
                    gout25 += trr_10x * hrr_1110y * wt;
                    gout26 += trr_10x * hrr_0110y * trr_10z;
                    double trr_31y = cpy * trr_30y + 3*b00 * trr_20y;
                    double hrr_2110y = trr_31y - yjyi * trr_21y;
                    gout27 += 1 * hrr_2110y * wt;
                    gout28 += 1 * hrr_1110y * trr_10z;
                    gout29 += 1 * hrr_0110y * trr_20z;
                    gout30 += trr_20x * trr_01y * hrr_0100z;
                    gout31 += trr_10x * trr_11y * hrr_0100z;
                    gout32 += trr_10x * trr_01y * hrr_1100z;
                    gout33 += 1 * trr_21y * hrr_0100z;
                    gout34 += 1 * trr_11y * hrr_1100z;
                    gout35 += 1 * trr_01y * hrr_2100z;
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout36 += hrr_2100x * fac * trr_01z;
                    gout37 += hrr_1100x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout38 += hrr_1100x * fac * trr_11z;
                    gout39 += hrr_0100x * trr_20y * trr_01z;
                    gout40 += hrr_0100x * trr_10y * trr_11z;
                    double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                    gout41 += hrr_0100x * fac * trr_21z;
                    gout42 += trr_20x * hrr_0100y * trr_01z;
                    gout43 += trr_10x * hrr_1100y * trr_01z;
                    gout44 += trr_10x * hrr_0100y * trr_11z;
                    gout45 += 1 * hrr_2100y * trr_01z;
                    gout46 += 1 * hrr_1100y * trr_11z;
                    gout47 += 1 * hrr_0100y * trr_21z;
                    double hrr_0110z = trr_11z - zjzi * trr_01z;
                    gout48 += trr_20x * fac * hrr_0110z;
                    gout49 += trr_10x * trr_10y * hrr_0110z;
                    double hrr_1110z = trr_21z - zjzi * trr_11z;
                    gout50 += trr_10x * fac * hrr_1110z;
                    gout51 += 1 * trr_20y * hrr_0110z;
                    gout52 += 1 * trr_10y * hrr_1110z;
                    double trr_31z = cpz * trr_30z + 3*b00 * trr_20z;
                    double hrr_2110z = trr_31z - zjzi * trr_21z;
                    gout53 += 1 * fac * hrr_2110z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout18 * dm[(j0+0)*nao+(k0+1)];
                val += gout36 * dm[(j0+0)*nao+(k0+2)];
                val += gout6 * dm[(j0+1)*nao+(k0+0)];
                val += gout24 * dm[(j0+1)*nao+(k0+1)];
                val += gout42 * dm[(j0+1)*nao+(k0+2)];
                val += gout12 * dm[(j0+2)*nao+(k0+0)];
                val += gout30 * dm[(j0+2)*nao+(k0+1)];
                val += gout48 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                val += gout37 * dm[(j0+0)*nao+(k0+2)];
                val += gout7 * dm[(j0+1)*nao+(k0+0)];
                val += gout25 * dm[(j0+1)*nao+(k0+1)];
                val += gout43 * dm[(j0+1)*nao+(k0+2)];
                val += gout13 * dm[(j0+2)*nao+(k0+0)];
                val += gout31 * dm[(j0+2)*nao+(k0+1)];
                val += gout49 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout20 * dm[(j0+0)*nao+(k0+1)];
                val += gout38 * dm[(j0+0)*nao+(k0+2)];
                val += gout8 * dm[(j0+1)*nao+(k0+0)];
                val += gout26 * dm[(j0+1)*nao+(k0+1)];
                val += gout44 * dm[(j0+1)*nao+(k0+2)];
                val += gout14 * dm[(j0+2)*nao+(k0+0)];
                val += gout32 * dm[(j0+2)*nao+(k0+1)];
                val += gout50 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+1)];
                val += gout39 * dm[(j0+0)*nao+(k0+2)];
                val += gout9 * dm[(j0+1)*nao+(k0+0)];
                val += gout27 * dm[(j0+1)*nao+(k0+1)];
                val += gout45 * dm[(j0+1)*nao+(k0+2)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                val += gout33 * dm[(j0+2)*nao+(k0+1)];
                val += gout51 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout22 * dm[(j0+0)*nao+(k0+1)];
                val += gout40 * dm[(j0+0)*nao+(k0+2)];
                val += gout10 * dm[(j0+1)*nao+(k0+0)];
                val += gout28 * dm[(j0+1)*nao+(k0+1)];
                val += gout46 * dm[(j0+1)*nao+(k0+2)];
                val += gout16 * dm[(j0+2)*nao+(k0+0)];
                val += gout34 * dm[(j0+2)*nao+(k0+1)];
                val += gout52 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout23 * dm[(j0+0)*nao+(k0+1)];
                val += gout41 * dm[(j0+0)*nao+(k0+2)];
                val += gout11 * dm[(j0+1)*nao+(k0+0)];
                val += gout29 * dm[(j0+1)*nao+(k0+1)];
                val += gout47 * dm[(j0+1)*nao+(k0+2)];
                val += gout17 * dm[(j0+2)*nao+(k0+0)];
                val += gout35 * dm[(j0+2)*nao+(k0+1)];
                val += gout53 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout18 * dm[(i0+0)*nao+(k0+1)];
                val += gout36 * dm[(i0+0)*nao+(k0+2)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout19 * dm[(i0+1)*nao+(k0+1)];
                val += gout37 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout20 * dm[(i0+2)*nao+(k0+1)];
                val += gout38 * dm[(i0+2)*nao+(k0+2)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout21 * dm[(i0+3)*nao+(k0+1)];
                val += gout39 * dm[(i0+3)*nao+(k0+2)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout22 * dm[(i0+4)*nao+(k0+1)];
                val += gout40 * dm[(i0+4)*nao+(k0+2)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout23 * dm[(i0+5)*nao+(k0+1)];
                val += gout41 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+0)];
                val += gout24 * dm[(i0+0)*nao+(k0+1)];
                val += gout42 * dm[(i0+0)*nao+(k0+2)];
                val += gout7 * dm[(i0+1)*nao+(k0+0)];
                val += gout25 * dm[(i0+1)*nao+(k0+1)];
                val += gout43 * dm[(i0+1)*nao+(k0+2)];
                val += gout8 * dm[(i0+2)*nao+(k0+0)];
                val += gout26 * dm[(i0+2)*nao+(k0+1)];
                val += gout44 * dm[(i0+2)*nao+(k0+2)];
                val += gout9 * dm[(i0+3)*nao+(k0+0)];
                val += gout27 * dm[(i0+3)*nao+(k0+1)];
                val += gout45 * dm[(i0+3)*nao+(k0+2)];
                val += gout10 * dm[(i0+4)*nao+(k0+0)];
                val += gout28 * dm[(i0+4)*nao+(k0+1)];
                val += gout46 * dm[(i0+4)*nao+(k0+2)];
                val += gout11 * dm[(i0+5)*nao+(k0+0)];
                val += gout29 * dm[(i0+5)*nao+(k0+1)];
                val += gout47 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(k0+0)];
                val += gout30 * dm[(i0+0)*nao+(k0+1)];
                val += gout48 * dm[(i0+0)*nao+(k0+2)];
                val += gout13 * dm[(i0+1)*nao+(k0+0)];
                val += gout31 * dm[(i0+1)*nao+(k0+1)];
                val += gout49 * dm[(i0+1)*nao+(k0+2)];
                val += gout14 * dm[(i0+2)*nao+(k0+0)];
                val += gout32 * dm[(i0+2)*nao+(k0+1)];
                val += gout50 * dm[(i0+2)*nao+(k0+2)];
                val += gout15 * dm[(i0+3)*nao+(k0+0)];
                val += gout33 * dm[(i0+3)*nao+(k0+1)];
                val += gout51 * dm[(i0+3)*nao+(k0+2)];
                val += gout16 * dm[(i0+4)*nao+(k0+0)];
                val += gout34 * dm[(i0+4)*nao+(k0+1)];
                val += gout52 * dm[(i0+4)*nao+(k0+2)];
                val += gout17 * dm[(i0+5)*nao+(k0+0)];
                val += gout35 * dm[(i0+5)*nao+(k0+1)];
                val += gout53 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout6 * dm[(j0+1)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout24 * dm[(j0+1)*nao+(l0+0)];
                val += gout30 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout36 * dm[(j0+0)*nao+(l0+0)];
                val += gout42 * dm[(j0+1)*nao+(l0+0)];
                val += gout48 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout7 * dm[(j0+1)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                val += gout25 * dm[(j0+1)*nao+(l0+0)];
                val += gout31 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout37 * dm[(j0+0)*nao+(l0+0)];
                val += gout43 * dm[(j0+1)*nao+(l0+0)];
                val += gout49 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout8 * dm[(j0+1)*nao+(l0+0)];
                val += gout14 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(l0+0)];
                val += gout26 * dm[(j0+1)*nao+(l0+0)];
                val += gout32 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout38 * dm[(j0+0)*nao+(l0+0)];
                val += gout44 * dm[(j0+1)*nao+(l0+0)];
                val += gout50 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout9 * dm[(j0+1)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout21 * dm[(j0+0)*nao+(l0+0)];
                val += gout27 * dm[(j0+1)*nao+(l0+0)];
                val += gout33 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout39 * dm[(j0+0)*nao+(l0+0)];
                val += gout45 * dm[(j0+1)*nao+(l0+0)];
                val += gout51 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                val += gout16 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout22 * dm[(j0+0)*nao+(l0+0)];
                val += gout28 * dm[(j0+1)*nao+(l0+0)];
                val += gout34 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout40 * dm[(j0+0)*nao+(l0+0)];
                val += gout46 * dm[(j0+1)*nao+(l0+0)];
                val += gout52 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(l0+0)];
                val += gout29 * dm[(j0+1)*nao+(l0+0)];
                val += gout35 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout41 * dm[(j0+0)*nao+(l0+0)];
                val += gout47 * dm[(j0+1)*nao+(l0+0)];
                val += gout53 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+0)];
                val += gout19 * dm[(i0+1)*nao+(l0+0)];
                val += gout20 * dm[(i0+2)*nao+(l0+0)];
                val += gout21 * dm[(i0+3)*nao+(l0+0)];
                val += gout22 * dm[(i0+4)*nao+(l0+0)];
                val += gout23 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout36 * dm[(i0+0)*nao+(l0+0)];
                val += gout37 * dm[(i0+1)*nao+(l0+0)];
                val += gout38 * dm[(i0+2)*nao+(l0+0)];
                val += gout39 * dm[(i0+3)*nao+(l0+0)];
                val += gout40 * dm[(i0+4)*nao+(l0+0)];
                val += gout41 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout9 * dm[(i0+3)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout24 * dm[(i0+0)*nao+(l0+0)];
                val += gout25 * dm[(i0+1)*nao+(l0+0)];
                val += gout26 * dm[(i0+2)*nao+(l0+0)];
                val += gout27 * dm[(i0+3)*nao+(l0+0)];
                val += gout28 * dm[(i0+4)*nao+(l0+0)];
                val += gout29 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout42 * dm[(i0+0)*nao+(l0+0)];
                val += gout43 * dm[(i0+1)*nao+(l0+0)];
                val += gout44 * dm[(i0+2)*nao+(l0+0)];
                val += gout45 * dm[(i0+3)*nao+(l0+0)];
                val += gout46 * dm[(i0+4)*nao+(l0+0)];
                val += gout47 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                val += gout17 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout30 * dm[(i0+0)*nao+(l0+0)];
                val += gout31 * dm[(i0+1)*nao+(l0+0)];
                val += gout32 * dm[(i0+2)*nao+(l0+0)];
                val += gout33 * dm[(i0+3)*nao+(l0+0)];
                val += gout34 * dm[(i0+4)*nao+(l0+0)];
                val += gout35 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout48 * dm[(i0+0)*nao+(l0+0)];
                val += gout49 * dm[(i0+1)*nao+(l0+0)];
                val += gout50 * dm[(i0+2)*nao+(l0+0)];
                val += gout51 * dm[(i0+3)*nao+(l0+0)];
                val += gout52 * dm[(i0+4)*nao+(l0+0)];
                val += gout53 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_2111(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int gout_id = threadIdx.y;
    int thread_id = 32 * gout_id + sq_id;
    int threads = 256;
    constexpr int nsq_per_block = 32;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (thread_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (thread_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    constexpr int g_size = 24;

    extern __shared__ double shared_memory[];
    double *rlrk = shared_memory + sq_id;
    double *Rpq = shared_memory + nsq_per_block * 3 + sq_id;
    double *akl_cache = shared_memory + nsq_per_block * 6 + sq_id;
    double *fac_ijkl = shared_memory + nsq_per_block * 8 + sq_id;
    double *gx = shared_memory + nsq_per_block * 9 + sq_id;
    double *rw = shared_memory + nsq_per_block * (g_size*3+9) + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * (g_size*3+bounds.nroots*2+9);

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double aij_cache[2];
    __shared__ double *expi;
    __shared__ double *expj;
    if (thread_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (thread_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[thread_id] = env[ri_ptr+thread_id];
        rjri[thread_id] = env[rj_ptr+thread_id] - ri[thread_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = thread_id; ij < iprim*jprim; ij += threads) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }

    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        if (gout_id == 0) {
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            rlrk[0] = xlxk;
            rlrk[32] = ylyk;
            rlrk[64] = zlzk;
            fac_ijkl[0] = fac_sym;
        }
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            __syncthreads();
            if (gout_id == 0) {
                double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
                double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
                double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
                double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double xlxk = rlrk[0];
                double ylyk = rlrk[32];
                double zlzk = rlrk[64];
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = ck[kp] * cl[lp] * Kcd;
                double fac_sym = fac_ijkl[0];
                gx[0] = fac_sym * ckcl;
                akl_cache[0] = akl;
                akl_cache[nsq_per_block] = al_akl;
            }
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double akl = akl_cache[0];
                double al_akl = akl_cache[nsq_per_block];
                double xij = ri[0] + (rjri[0]) * aj_aij;
                double yij = ri[1] + (rjri[1]) * aj_aij;
                double zij = ri[2] + (rjri[2]) * aj_aij;
                double xkl = rk[0] + rlrk[0*nsq_per_block] * al_akl;
                double ykl = rk[1] + rlrk[1*nsq_per_block] * al_akl;
                double zkl = rk[2] + rlrk[2*nsq_per_block] * al_akl;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                if (gout_id == 0) {
                    Rpq[0*nsq_per_block] = xpq;
                    Rpq[1*nsq_per_block] = ypq;
                    Rpq[2*nsq_per_block] = zpq;
                    double cicj = cicj_cache[ijp];
                    gx[nsq_per_block*g_size] = cicj / (aij*akl*sqrt(aij+akl));
                    if (sq_id == 0) {
                        aij_cache[0] = aij;
                        aij_cache[1] = aj_aij;
                    }
                }
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                for (int irys = 0; irys < nroots; ++irys) {
                    __syncthreads();
                    double s0, s1, s2;
                    double rt = rw[irys*64];
                    double aij = aij_cache[0];
                    double rt_aa = rt / (aij + akl);
                    double akl = akl_cache[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double rt_akl = rt_aa * aij;
                    double b00 = .5 * rt_aa;
                    double b01 = .5/akl * (1 - rt_akl);
                    for (int n = gout_id; n < 3; n += 8) {
                        if (n == 2) {
                            gx[1536] = rw[irys*64+32];
                        }
                        double *_gx = gx + n * 768;
                        double xjxi = rjri[n];
                        double Rpa = xjxi * aij_cache[1];
                        double c0x = Rpa - rt_aij * Rpq[n*32];
                        s0 = _gx[0];
                        s1 = c0x * s0;
                        _gx[32] = s1;
                        s2 = c0x * s1 + 1 * b10 * s0;
                        _gx[64] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = c0x * s1 + 2 * b10 * s0;
                        _gx[96] = s2;
                        double xlxk = rlrk[n*32];
                        double Rqc = xlxk * akl_cache[32];
                        double cpx = Rqc + rt_akl * Rpq[n*32];
                        s0 = _gx[0];
                        s1 = cpx * s0;
                        _gx[192] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        _gx[384] = s2;
                        s0 = _gx[32];
                        s1 = cpx * s0;
                        s1 += 1 * b00 * _gx[0];
                        _gx[224] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 1 * b00 * _gx[192];
                        _gx[416] = s2;
                        s0 = _gx[64];
                        s1 = cpx * s0;
                        s1 += 2 * b00 * _gx[32];
                        _gx[256] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 2 * b00 * _gx[224];
                        _gx[448] = s2;
                        s0 = _gx[96];
                        s1 = cpx * s0;
                        s1 += 3 * b00 * _gx[64];
                        _gx[288] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 3 * b00 * _gx[256];
                        _gx[480] = s2;
                        s1 = _gx[96];
                        s0 = _gx[64];
                        _gx[160] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[32];
                        _gx[128] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[0];
                        _gx[96] = s1 - xjxi * s0;
                        s1 = _gx[288];
                        s0 = _gx[256];
                        _gx[352] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[224];
                        _gx[320] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[192];
                        _gx[288] = s1 - xjxi * s0;
                        s1 = _gx[480];
                        s0 = _gx[448];
                        _gx[544] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[416];
                        _gx[512] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[384];
                        _gx[480] = s1 - xjxi * s0;
                        s1 = _gx[384];
                        s0 = _gx[192];
                        _gx[576] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[0];
                        _gx[384] = s1 - xlxk * s0;
                        s1 = _gx[416];
                        s0 = _gx[224];
                        _gx[608] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[32];
                        _gx[416] = s1 - xlxk * s0;
                        s1 = _gx[448];
                        s0 = _gx[256];
                        _gx[640] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[64];
                        _gx[448] = s1 - xlxk * s0;
                        s1 = _gx[480];
                        s0 = _gx[288];
                        _gx[672] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[96];
                        _gx[480] = s1 - xlxk * s0;
                        s1 = _gx[512];
                        s0 = _gx[320];
                        _gx[704] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[128];
                        _gx[512] = s1 - xlxk * s0;
                        s1 = _gx[544];
                        s0 = _gx[352];
                        _gx[736] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[160];
                        _gx[544] = s1 - xlxk * s0;
                    }
                    __syncthreads();
                    switch (gout_id) {
                    case 0:
                    gout0 += gx[736] * gx[768] * gx[1536];
                    gout1 += gx[608] * gx[864] * gx[1568];
                    gout2 += gx[576] * gx[800] * gx[1664];
                    gout3 += gx[448] * gx[1056] * gx[1536];
                    gout4 += gx[416] * gx[960] * gx[1664];
                    gout5 += gx[480] * gx[800] * gx[1760];
                    gout6 += gx[448] * gx[768] * gx[1824];
                    gout7 += gx[320] * gx[1152] * gx[1568];
                    gout8 += gx[192] * gx[1280] * gx[1568];
                    gout9 += gx[160] * gx[1344] * gx[1536];
                    gout10 += gx[32] * gx[1440] * gx[1568];
                    gout11 += gx[0] * gx[1376] * gx[1664];
                    gout12 += gx[64] * gx[1248] * gx[1728];
                    gout13 += gx[32] * gx[1152] * gx[1856];
                    gout14 += gx[288] * gx[800] * gx[1952];
                    gout15 += gx[256] * gx[768] * gx[2016];
                    gout16 += gx[128] * gx[960] * gx[1952];
                    gout17 += gx[0] * gx[1088] * gx[1952];
                    gout18 += gx[160] * gx[768] * gx[2112];
                    gout19 += gx[32] * gx[864] * gx[2144];
                    gout20 += gx[0] * gx[800] * gx[2240];
                    break;
                    case 1:
                    gout0 += gx[704] * gx[800] * gx[1536];
                    gout1 += gx[576] * gx[928] * gx[1536];
                    gout2 += gx[576] * gx[768] * gx[1696];
                    gout3 += gx[416] * gx[1088] * gx[1536];
                    gout4 += gx[384] * gx[1024] * gx[1632];
                    gout5 += gx[480] * gx[768] * gx[1792];
                    gout6 += gx[416] * gx[800] * gx[1824];
                    gout7 += gx[288] * gx[1216] * gx[1536];
                    gout8 += gx[192] * gx[1248] * gx[1600];
                    gout9 += gx[128] * gx[1376] * gx[1536];
                    gout10 += gx[0] * gx[1504] * gx[1536];
                    gout11 += gx[0] * gx[1344] * gx[1696];
                    gout12 += gx[32] * gx[1280] * gx[1728];
                    gout13 += gx[0] * gx[1216] * gx[1824];
                    gout14 += gx[288] * gx[768] * gx[1984];
                    gout15 += gx[224] * gx[800] * gx[2016];
                    gout16 += gx[96] * gx[1024] * gx[1920];
                    gout17 += gx[0] * gx[1056] * gx[1984];
                    gout18 += gx[128] * gx[800] * gx[2112];
                    gout19 += gx[0] * gx[928] * gx[2112];
                    gout20 += gx[0] * gx[768] * gx[2272];
                    break;
                    case 2:
                    gout0 += gx[704] * gx[768] * gx[1568];
                    gout1 += gx[576] * gx[896] * gx[1568];
                    gout2 += gx[544] * gx[960] * gx[1536];
                    gout3 += gx[416] * gx[1056] * gx[1568];
                    gout4 += gx[384] * gx[992] * gx[1664];
                    gout5 += gx[448] * gx[864] * gx[1728];
                    gout6 += gx[416] * gx[768] * gx[1856];
                    gout7 += gx[288] * gx[1184] * gx[1568];
                    gout8 += gx[256] * gx[1152] * gx[1632];
                    gout9 += gx[128] * gx[1344] * gx[1568];
                    gout10 += gx[0] * gx[1472] * gx[1568];
                    gout11 += gx[160] * gx[1152] * gx[1728];
                    gout12 += gx[32] * gx[1248] * gx[1760];
                    gout13 += gx[0] * gx[1184] * gx[1856];
                    gout14 += gx[256] * gx[864] * gx[1920];
                    gout15 += gx[224] * gx[768] * gx[2048];
                    gout16 += gx[96] * gx[992] * gx[1952];
                    gout17 += gx[64] * gx[960] * gx[2016];
                    gout18 += gx[128] * gx[768] * gx[2144];
                    gout19 += gx[0] * gx[896] * gx[2144];
                    break;
                    case 3:
                    gout0 += gx[672] * gx[832] * gx[1536];
                    gout1 += gx[576] * gx[864] * gx[1600];
                    gout2 += gx[512] * gx[992] * gx[1536];
                    gout3 += gx[384] * gx[1120] * gx[1536];
                    gout4 += gx[384] * gx[960] * gx[1696];
                    gout5 += gx[416] * gx[896] * gx[1728];
                    gout6 += gx[384] * gx[832] * gx[1824];
                    gout7 += gx[288] * gx[1152] * gx[1600];
                    gout8 += gx[224] * gx[1184] * gx[1632];
                    gout9 += gx[96] * gx[1408] * gx[1536];
                    gout10 += gx[0] * gx[1440] * gx[1600];
                    gout11 += gx[128] * gx[1184] * gx[1728];
                    gout12 += gx[0] * gx[1312] * gx[1728];
                    gout13 += gx[0] * gx[1152] * gx[1888];
                    gout14 += gx[224] * gx[896] * gx[1920];
                    gout15 += gx[192] * gx[832] * gx[2016];
                    gout16 += gx[96] * gx[960] * gx[1984];
                    gout17 += gx[32] * gx[992] * gx[2016];
                    gout18 += gx[96] * gx[832] * gx[2112];
                    gout19 += gx[0] * gx[864] * gx[2176];
                    break;
                    case 4:
                    gout0 += gx[672] * gx[800] * gx[1568];
                    gout1 += gx[640] * gx[768] * gx[1632];
                    gout2 += gx[512] * gx[960] * gx[1568];
                    gout3 += gx[384] * gx[1088] * gx[1568];
                    gout4 += gx[544] * gx[768] * gx[1728];
                    gout5 += gx[416] * gx[864] * gx[1760];
                    gout6 += gx[384] * gx[800] * gx[1856];
                    gout7 += gx[256] * gx[1248] * gx[1536];
                    gout8 += gx[224] * gx[1152] * gx[1664];
                    gout9 += gx[96] * gx[1376] * gx[1568];
                    gout10 += gx[64] * gx[1344] * gx[1632];
                    gout11 += gx[128] * gx[1152] * gx[1760];
                    gout12 += gx[0] * gx[1280] * gx[1760];
                    gout13 += gx[352] * gx[768] * gx[1920];
                    gout14 += gx[224] * gx[864] * gx[1952];
                    gout15 += gx[192] * gx[800] * gx[2048];
                    gout16 += gx[64] * gx[1056] * gx[1920];
                    gout17 += gx[32] * gx[960] * gx[2048];
                    gout18 += gx[96] * gx[800] * gx[2144];
                    gout19 += gx[64] * gx[768] * gx[2208];
                    break;
                    case 5:
                    gout0 += gx[672] * gx[768] * gx[1600];
                    gout1 += gx[608] * gx[800] * gx[1632];
                    gout2 += gx[480] * gx[1024] * gx[1536];
                    gout3 += gx[384] * gx[1056] * gx[1600];
                    gout4 += gx[512] * gx[800] * gx[1728];
                    gout5 += gx[384] * gx[928] * gx[1728];
                    gout6 += gx[384] * gx[768] * gx[1888];
                    gout7 += gx[224] * gx[1280] * gx[1536];
                    gout8 += gx[192] * gx[1216] * gx[1632];
                    gout9 += gx[96] * gx[1344] * gx[1600];
                    gout10 += gx[32] * gx[1376] * gx[1632];
                    gout11 += gx[96] * gx[1216] * gx[1728];
                    gout12 += gx[0] * gx[1248] * gx[1792];
                    gout13 += gx[320] * gx[800] * gx[1920];
                    gout14 += gx[192] * gx[928] * gx[1920];
                    gout15 += gx[192] * gx[768] * gx[2080];
                    gout16 += gx[32] * gx[1088] * gx[1920];
                    gout17 += gx[0] * gx[1024] * gx[2016];
                    gout18 += gx[96] * gx[768] * gx[2176];
                    gout19 += gx[32] * gx[800] * gx[2208];
                    break;
                    case 6:
                    gout0 += gx[640] * gx[864] * gx[1536];
                    gout1 += gx[608] * gx[768] * gx[1664];
                    gout2 += gx[480] * gx[992] * gx[1568];
                    gout3 += gx[448] * gx[960] * gx[1632];
                    gout4 += gx[512] * gx[768] * gx[1760];
                    gout5 += gx[384] * gx[896] * gx[1760];
                    gout6 += gx[352] * gx[1152] * gx[1536];
                    gout7 += gx[224] * gx[1248] * gx[1568];
                    gout8 += gx[192] * gx[1184] * gx[1664];
                    gout9 += gx[64] * gx[1440] * gx[1536];
                    gout10 += gx[32] * gx[1344] * gx[1664];
                    gout11 += gx[96] * gx[1184] * gx[1760];
                    gout12 += gx[64] * gx[1152] * gx[1824];
                    gout13 += gx[320] * gx[768] * gx[1952];
                    gout14 += gx[192] * gx[896] * gx[1952];
                    gout15 += gx[160] * gx[960] * gx[1920];
                    gout16 += gx[32] * gx[1056] * gx[1952];
                    gout17 += gx[0] * gx[992] * gx[2048];
                    gout18 += gx[64] * gx[864] * gx[2112];
                    gout19 += gx[32] * gx[768] * gx[2240];
                    break;
                    case 7:
                    gout0 += gx[608] * gx[896] * gx[1536];
                    gout1 += gx[576] * gx[832] * gx[1632];
                    gout2 += gx[480] * gx[960] * gx[1600];
                    gout3 += gx[416] * gx[992] * gx[1632];
                    gout4 += gx[480] * gx[832] * gx[1728];
                    gout5 += gx[384] * gx[864] * gx[1792];
                    gout6 += gx[320] * gx[1184] * gx[1536];
                    gout7 += gx[192] * gx[1312] * gx[1536];
                    gout8 += gx[192] * gx[1152] * gx[1696];
                    gout9 += gx[32] * gx[1472] * gx[1536];
                    gout10 += gx[0] * gx[1408] * gx[1632];
                    gout11 += gx[96] * gx[1152] * gx[1792];
                    gout12 += gx[32] * gx[1184] * gx[1824];
                    gout13 += gx[288] * gx[832] * gx[1920];
                    gout14 += gx[192] * gx[864] * gx[1984];
                    gout15 += gx[128] * gx[992] * gx[1920];
                    gout16 += gx[0] * gx[1120] * gx[1920];
                    gout17 += gx[0] * gx[960] * gx[2080];
                    gout18 += gx[32] * gx[896] * gx[2112];
                    gout19 += gx[0] * gx[832] * gx[2208];
                    break;
                    }
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                switch (gout_id) {
                case 0:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(k0+0)];
                val += gout4 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout13 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(k0+1)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+2)];
                val += gout2 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(k0+0)];
                val += gout17 * dm[(j0+1)*nao+(k0+1)];
                val += gout20 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout5 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(k0+1)];
                val += gout7 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(k0+2)];
                val += gout16 * dm[(i0+2)*nao+(k0+1)];
                val += gout14 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(k0+1)];
                val += gout1 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(k0+2)];
                val += gout10 * dm[(i0+2)*nao+(k0+1)];
                val += gout8 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(i0+2)*nao+(k0+2)];
                val += gout17 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+2)];
                val += gout4 * dm[(i0+2)*nao+(k0+1)];
                val += gout2 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout13 * dm[(i0+2)*nao+(k0+2)];
                val += gout11 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(k0+0)];
                val += gout20 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                val += gout12 * dm[(j0+1)*nao+(l0+1)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+1)];
                val += gout1 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+2)];
                val += gout10 * dm[(j0+1)*nao+(l0+1)];
                val += gout4 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+2)];
                val += gout13 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+2)];
                val += gout8 * dm[(j0+1)*nao+(l0+1)];
                val += gout2 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout17 * dm[(j0+1)*nao+(l0+2)];
                val += gout11 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+2)*nao+(l0+1)];
                val += gout14 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(l0+1)];
                val += gout16 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+2)];
                val += gout5 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(i0+2)*nao+(l0+0)];
                val += gout8 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(l0+0)];
                val += gout10 * dm[(i0+2)*nao+(l0+1)];
                val += gout17 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+1)];
                val += gout19 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(l0+2)];
                val += gout2 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(i0+2)*nao+(l0+0)];
                val += gout11 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+2)*nao+(l0+1)];
                val += gout20 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 1:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(k0+0)];
                val += gout4 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout13 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(k0+1)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+2)];
                val += gout2 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(k0+0)];
                val += gout17 * dm[(j0+1)*nao+(k0+1)];
                val += gout20 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(k0+0)];
                val += gout5 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(i0+1)*nao+(k0+1)];
                val += gout7 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+1)*nao+(k0+2)];
                val += gout16 * dm[(i0+3)*nao+(k0+1)];
                val += gout14 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(k0+1)];
                val += gout1 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(k0+2)];
                val += gout10 * dm[(i0+3)*nao+(k0+1)];
                val += gout8 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(i0+3)*nao+(k0+2)];
                val += gout17 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(k0+2)];
                val += gout4 * dm[(i0+3)*nao+(k0+1)];
                val += gout2 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout13 * dm[(i0+3)*nao+(k0+2)];
                val += gout11 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(i0+1)*nao+(k0+0)];
                val += gout20 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                val += gout12 * dm[(j0+1)*nao+(l0+1)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+1)];
                val += gout1 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+2)];
                val += gout10 * dm[(j0+1)*nao+(l0+1)];
                val += gout4 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+2)];
                val += gout13 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+2)];
                val += gout8 * dm[(j0+1)*nao+(l0+1)];
                val += gout2 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout17 * dm[(j0+1)*nao+(l0+2)];
                val += gout11 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(l0+0)];
                val += gout7 * dm[(i0+3)*nao+(l0+1)];
                val += gout14 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+1)*nao+(l0+1)];
                val += gout16 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+1)*nao+(l0+2)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(i0+3)*nao+(l0+0)];
                val += gout8 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(l0+0)];
                val += gout10 * dm[(i0+3)*nao+(l0+1)];
                val += gout17 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(l0+1)];
                val += gout19 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(i0+1)*nao+(l0+2)];
                val += gout2 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(i0+3)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(l0+0)];
                val += gout13 * dm[(i0+3)*nao+(l0+1)];
                val += gout20 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 2:
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+2)];
                val += gout8 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(k0+0)];
                val += gout4 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout13 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(k0+1)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(i0+0)*nao+(k0+1)];
                val += gout0 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(i0+0)*nao+(k0+2)];
                val += gout9 * dm[(i0+2)*nao+(k0+1)];
                val += gout7 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+2)*nao+(k0+2)];
                val += gout16 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout5 * dm[(i0+0)*nao+(k0+2)];
                val += gout3 * dm[(i0+2)*nao+(k0+1)];
                val += gout1 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+2)*nao+(k0+2)];
                val += gout10 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(i0+0)*nao+(k0+0)];
                val += gout19 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout6 * dm[(i0+2)*nao+(k0+2)];
                val += gout4 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(i0+0)*nao+(k0+0)];
                val += gout13 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout17 * dm[(i0+0)*nao+(k0+1)];
                val += gout15 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(l0+2)];
                val += gout8 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout5 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                val += gout12 * dm[(j0+1)*nao+(l0+1)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+1)];
                val += gout1 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+2)];
                val += gout10 * dm[(j0+1)*nao+(l0+1)];
                val += gout4 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+2)];
                val += gout13 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+2)*nao+(l0+0)];
                val += gout7 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(i0+0)*nao+(l0+0)];
                val += gout9 * dm[(i0+2)*nao+(l0+1)];
                val += gout16 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(i0+0)*nao+(l0+1)];
                val += gout18 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(i0+0)*nao+(l0+2)];
                val += gout1 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(i0+2)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(i0+0)*nao+(l0+0)];
                val += gout12 * dm[(i0+2)*nao+(l0+1)];
                val += gout19 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(i0+0)*nao+(l0+1)];
                val += gout15 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout17 * dm[(i0+0)*nao+(l0+2)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(i0+2)*nao+(l0+0)];
                val += gout13 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 3:
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+2)];
                val += gout8 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(k0+0)];
                val += gout4 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout13 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(k0+1)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(i0+1)*nao+(k0+1)];
                val += gout0 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(i0+1)*nao+(k0+2)];
                val += gout9 * dm[(i0+3)*nao+(k0+1)];
                val += gout7 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+3)*nao+(k0+2)];
                val += gout16 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout5 * dm[(i0+1)*nao+(k0+2)];
                val += gout3 * dm[(i0+3)*nao+(k0+1)];
                val += gout1 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+3)*nao+(k0+2)];
                val += gout10 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(i0+1)*nao+(k0+0)];
                val += gout19 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout6 * dm[(i0+3)*nao+(k0+2)];
                val += gout4 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(i0+1)*nao+(k0+0)];
                val += gout13 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout17 * dm[(i0+1)*nao+(k0+1)];
                val += gout15 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(l0+2)];
                val += gout8 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout5 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                val += gout12 * dm[(j0+1)*nao+(l0+1)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+1)];
                val += gout1 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+2)];
                val += gout10 * dm[(j0+1)*nao+(l0+1)];
                val += gout4 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+2)];
                val += gout13 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+3)*nao+(l0+0)];
                val += gout7 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(i0+1)*nao+(l0+0)];
                val += gout9 * dm[(i0+3)*nao+(l0+1)];
                val += gout16 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(i0+1)*nao+(l0+1)];
                val += gout18 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(i0+1)*nao+(l0+2)];
                val += gout1 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout10 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(i0+1)*nao+(l0+0)];
                val += gout12 * dm[(i0+3)*nao+(l0+1)];
                val += gout19 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(i0+1)*nao+(l0+1)];
                val += gout15 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout17 * dm[(i0+1)*nao+(l0+2)];
                val += gout4 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(i0+3)*nao+(l0+0)];
                val += gout13 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 4:
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+2)];
                val += gout1 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+1)*nao+(k0+0)];
                val += gout10 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+1)*nao+(k0+1)];
                val += gout19 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+2)];
                val += gout8 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(i0+0)*nao+(k0+2)];
                val += gout2 * dm[(i0+2)*nao+(k0+1)];
                val += gout0 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(i0+2)*nao+(k0+2)];
                val += gout9 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout13 * dm[(i0+0)*nao+(k0+0)];
                val += gout18 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout5 * dm[(i0+2)*nao+(k0+2)];
                val += gout3 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(k0+0)];
                val += gout12 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(i0+0)*nao+(k0+1)];
                val += gout14 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(i0+0)*nao+(k0+0)];
                val += gout6 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(k0+1)];
                val += gout8 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(i0+0)*nao+(k0+2)];
                val += gout17 * dm[(i0+2)*nao+(k0+1)];
                val += gout15 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+2)];
                val += gout7 * dm[(j0+1)*nao+(l0+1)];
                val += gout1 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+1)*nao+(l0+2)];
                val += gout10 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(l0+2)];
                val += gout8 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout5 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                val += gout12 * dm[(j0+1)*nao+(l0+1)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout13 * dm[(i0+0)*nao+(l0+2)];
                val += gout0 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout9 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(i0+0)*nao+(l0+0)];
                val += gout11 * dm[(i0+2)*nao+(l0+1)];
                val += gout18 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(l0+1)];
                val += gout14 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(i0+0)*nao+(l0+2)];
                val += gout3 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                val += gout12 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(i0+0)*nao+(l0+0)];
                val += gout8 * dm[(i0+2)*nao+(l0+1)];
                val += gout15 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+1)];
                val += gout17 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(i0+0)*nao+(l0+2)];
                val += gout6 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 5:
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+2)];
                val += gout1 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+1)*nao+(k0+0)];
                val += gout10 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+1)*nao+(k0+1)];
                val += gout19 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+2)];
                val += gout8 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout3 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout12 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+3)*nao+(k0+1)];
                val += gout0 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(i0+3)*nao+(k0+2)];
                val += gout9 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout13 * dm[(i0+1)*nao+(k0+0)];
                val += gout18 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout5 * dm[(i0+3)*nao+(k0+2)];
                val += gout3 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(k0+0)];
                val += gout12 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(i0+1)*nao+(k0+1)];
                val += gout14 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout6 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(k0+1)];
                val += gout8 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(i0+1)*nao+(k0+2)];
                val += gout17 * dm[(i0+3)*nao+(k0+1)];
                val += gout15 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+2)];
                val += gout7 * dm[(j0+1)*nao+(l0+1)];
                val += gout1 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+1)*nao+(l0+2)];
                val += gout10 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(l0+2)];
                val += gout8 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout5 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                val += gout12 * dm[(j0+1)*nao+(l0+1)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout13 * dm[(i0+1)*nao+(l0+2)];
                val += gout0 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(i0+3)*nao+(l0+0)];
                val += gout9 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(i0+1)*nao+(l0+0)];
                val += gout11 * dm[(i0+3)*nao+(l0+1)];
                val += gout18 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(l0+1)];
                val += gout14 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(i0+1)*nao+(l0+2)];
                val += gout3 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(i0+3)*nao+(l0+0)];
                val += gout12 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+3)*nao+(l0+1)];
                val += gout15 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(l0+1)];
                val += gout17 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(i0+1)*nao+(l0+2)];
                val += gout6 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 6:
                val = 0;
                val += gout0 * dm[(j0+1)*nao+(k0+0)];
                val += gout3 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+1)*nao+(k0+1)];
                val += gout12 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(k0+1)];
                val += gout18 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+2)];
                val += gout1 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+1)*nao+(k0+0)];
                val += gout10 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+1)*nao+(k0+1)];
                val += gout19 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+2)];
                val += gout8 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(i0+2)*nao+(k0+2)];
                val += gout2 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+0)];
                val += gout11 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(k0+1)];
                val += gout13 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout5 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(k0+1)];
                val += gout7 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(k0+2)];
                val += gout16 * dm[(i0+2)*nao+(k0+1)];
                val += gout14 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(k0+1)];
                val += gout1 * dm[(i0+2)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(k0+2)];
                val += gout10 * dm[(i0+2)*nao+(k0+1)];
                val += gout8 * dm[(i0+4)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(i0+2)*nao+(k0+2)];
                val += gout17 * dm[(i0+4)*nao+(k0+1)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+1)];
                val += gout0 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+2)];
                val += gout9 * dm[(j0+1)*nao+(l0+1)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+1)*nao+(l0+2)];
                val += gout12 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+2)];
                val += gout7 * dm[(j0+1)*nao+(l0+1)];
                val += gout1 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+1)*nao+(l0+2)];
                val += gout10 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(l0+2)];
                val += gout8 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout5 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+1)];
                val += gout13 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(l0+2)];
                val += gout2 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(i0+2)*nao+(l0+0)];
                val += gout11 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+2)*nao+(l0+1)];
                val += gout14 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(l0+1)];
                val += gout16 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+2)];
                val += gout5 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(i0+2)*nao+(l0+0)];
                val += gout8 * dm[(i0+4)*nao+(l0+1)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(l0+0)];
                val += gout10 * dm[(i0+2)*nao+(l0+1)];
                val += gout17 * dm[(i0+4)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+1)];
                val += gout19 * dm[(i0+2)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 7:
                val = 0;
                val += gout0 * dm[(j0+1)*nao+(k0+0)];
                val += gout3 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+1)*nao+(k0+1)];
                val += gout12 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(k0+1)];
                val += gout18 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+2)];
                val += gout1 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+1)*nao+(k0+0)];
                val += gout10 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+1)*nao+(k0+1)];
                val += gout19 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                val += gout5 * dm[(j0+1)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+2)];
                val += gout8 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(i0+3)*nao+(k0+2)];
                val += gout2 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(k0+0)];
                val += gout11 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(i0+1)*nao+(k0+1)];
                val += gout13 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(k0+0)];
                val += gout5 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(i0+1)*nao+(k0+1)];
                val += gout7 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(i0+1)*nao+(k0+2)];
                val += gout16 * dm[(i0+3)*nao+(k0+1)];
                val += gout14 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(k0+1)];
                val += gout1 * dm[(i0+3)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(k0+2)];
                val += gout10 * dm[(i0+3)*nao+(k0+1)];
                val += gout8 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(i0+3)*nao+(k0+2)];
                val += gout17 * dm[(i0+5)*nao+(k0+1)];
                atomicAdd(vk+(j0+2)*nao+(l0+2), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+1)];
                val += gout0 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+2)];
                val += gout9 * dm[(j0+1)*nao+(l0+1)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+1)*nao+(l0+2)];
                val += gout12 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+2)];
                val += gout7 * dm[(j0+1)*nao+(l0+1)];
                val += gout1 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+1)*nao+(l0+2)];
                val += gout10 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+1)*nao+(l0+2)];
                val += gout8 * dm[(j0+2)*nao+(l0+1)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                val += gout5 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(l0+1)];
                val += gout13 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(i0+1)*nao+(l0+2)];
                val += gout2 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout4 * dm[(i0+3)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(l0+0)];
                val += gout7 * dm[(i0+3)*nao+(l0+1)];
                val += gout14 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+1)*nao+(l0+1)];
                val += gout16 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+1)*nao+(l0+2)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(i0+3)*nao+(l0+0)];
                val += gout8 * dm[(i0+5)*nao+(l0+1)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(l0+0)];
                val += gout10 * dm[(i0+3)*nao+(l0+1)];
                val += gout17 * dm[(i0+5)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(l0+1)];
                val += gout19 * dm[(i0+3)*nao+(l0+2)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                }
            }
        }
    }
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_2120(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int gout_id = threadIdx.y;
    int thread_id = 64 * gout_id + sq_id;
    int threads = 256;
    constexpr int nsq_per_block = 64;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (thread_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (thread_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    constexpr int g_size = 18;

    extern __shared__ double shared_memory[];
    double *rlrk = shared_memory + sq_id;
    double *Rpq = shared_memory + nsq_per_block * 3 + sq_id;
    double *akl_cache = shared_memory + nsq_per_block * 6 + sq_id;
    double *fac_ijkl = shared_memory + nsq_per_block * 8 + sq_id;
    double *gx = shared_memory + nsq_per_block * 9 + sq_id;
    double *rw = shared_memory + nsq_per_block * (g_size*3+9) + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * (g_size*3+bounds.nroots*2+9);

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double aij_cache[2];
    __shared__ double *expi;
    __shared__ double *expj;
    if (thread_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (thread_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[thread_id] = env[ri_ptr+thread_id];
        rjri[thread_id] = env[rj_ptr+thread_id] - ri[thread_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = thread_id; ij < iprim*jprim; ij += threads) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }

    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        if (gout_id == 0) {
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            rlrk[0] = xlxk;
            rlrk[64] = ylyk;
            rlrk[128] = zlzk;
            fac_ijkl[0] = fac_sym;
        }
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            __syncthreads();
            if (gout_id == 0) {
                double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
                double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
                double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
                double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double xlxk = rlrk[0];
                double ylyk = rlrk[64];
                double zlzk = rlrk[128];
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = ck[kp] * cl[lp] * Kcd;
                double fac_sym = fac_ijkl[0];
                gx[0] = fac_sym * ckcl;
                akl_cache[0] = akl;
                akl_cache[nsq_per_block] = al_akl;
            }
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double akl = akl_cache[0];
                double al_akl = akl_cache[nsq_per_block];
                double xij = ri[0] + (rjri[0]) * aj_aij;
                double yij = ri[1] + (rjri[1]) * aj_aij;
                double zij = ri[2] + (rjri[2]) * aj_aij;
                double xkl = rk[0] + rlrk[0*nsq_per_block] * al_akl;
                double ykl = rk[1] + rlrk[1*nsq_per_block] * al_akl;
                double zkl = rk[2] + rlrk[2*nsq_per_block] * al_akl;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                if (gout_id == 0) {
                    Rpq[0*nsq_per_block] = xpq;
                    Rpq[1*nsq_per_block] = ypq;
                    Rpq[2*nsq_per_block] = zpq;
                    double cicj = cicj_cache[ijp];
                    gx[nsq_per_block*g_size] = cicj / (aij*akl*sqrt(aij+akl));
                    if (sq_id == 0) {
                        aij_cache[0] = aij;
                        aij_cache[1] = aj_aij;
                    }
                }
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                for (int irys = 0; irys < nroots; ++irys) {
                    __syncthreads();
                    double s0, s1, s2;
                    double rt = rw[irys*128];
                    double aij = aij_cache[0];
                    double rt_aa = rt / (aij + akl);
                    double akl = akl_cache[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double rt_akl = rt_aa * aij;
                    double b00 = .5 * rt_aa;
                    double b01 = .5/akl * (1 - rt_akl);
                    for (int n = gout_id; n < 3; n += 4) {
                        if (n == 2) {
                            gx[2304] = rw[irys*128+64];
                        }
                        double *_gx = gx + n * 1152;
                        double xjxi = rjri[n];
                        double Rpa = xjxi * aij_cache[1];
                        double c0x = Rpa - rt_aij * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = c0x * s0;
                        _gx[64] = s1;
                        s2 = c0x * s1 + 1 * b10 * s0;
                        _gx[128] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = c0x * s1 + 2 * b10 * s0;
                        _gx[192] = s2;
                        double xlxk = rlrk[n*64];
                        double Rqc = xlxk * akl_cache[64];
                        double cpx = Rqc + rt_akl * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = cpx * s0;
                        _gx[384] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        _gx[768] = s2;
                        s0 = _gx[64];
                        s1 = cpx * s0;
                        s1 += 1 * b00 * _gx[0];
                        _gx[448] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 1 * b00 * _gx[384];
                        _gx[832] = s2;
                        s0 = _gx[128];
                        s1 = cpx * s0;
                        s1 += 2 * b00 * _gx[64];
                        _gx[512] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 2 * b00 * _gx[448];
                        _gx[896] = s2;
                        s0 = _gx[192];
                        s1 = cpx * s0;
                        s1 += 3 * b00 * _gx[128];
                        _gx[576] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 3 * b00 * _gx[512];
                        _gx[960] = s2;
                        s1 = _gx[192];
                        s0 = _gx[128];
                        _gx[320] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[64];
                        _gx[256] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[0];
                        _gx[192] = s1 - xjxi * s0;
                        s1 = _gx[576];
                        s0 = _gx[512];
                        _gx[704] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[448];
                        _gx[640] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[384];
                        _gx[576] = s1 - xjxi * s0;
                        s1 = _gx[960];
                        s0 = _gx[896];
                        _gx[1088] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[832];
                        _gx[1024] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[768];
                        _gx[960] = s1 - xjxi * s0;
                    }
                    __syncthreads();
                    switch (gout_id) {
                    case 0:
                    gout0 += gx[1088] * gx[1152] * gx[2304];
                    gout1 += gx[960] * gx[1216] * gx[2368];
                    gout2 += gx[832] * gx[1344] * gx[2368];
                    gout3 += gx[896] * gx[1152] * gx[2496];
                    gout4 += gx[768] * gx[1216] * gx[2560];
                    gout5 += gx[640] * gx[1536] * gx[2368];
                    gout6 += gx[512] * gx[1728] * gx[2304];
                    gout7 += gx[384] * gx[1792] * gx[2368];
                    gout8 += gx[448] * gx[1536] * gx[2560];
                    gout9 += gx[704] * gx[1152] * gx[2688];
                    gout10 += gx[576] * gx[1216] * gx[2752];
                    gout11 += gx[448] * gx[1344] * gx[2752];
                    gout12 += gx[512] * gx[1152] * gx[2880];
                    gout13 += gx[384] * gx[1216] * gx[2944];
                    gout14 += gx[256] * gx[1920] * gx[2368];
                    gout15 += gx[128] * gx[2112] * gx[2304];
                    gout16 += gx[0] * gx[2176] * gx[2368];
                    gout17 += gx[64] * gx[1920] * gx[2560];
                    gout18 += gx[320] * gx[1536] * gx[2688];
                    gout19 += gx[192] * gx[1600] * gx[2752];
                    gout20 += gx[64] * gx[1728] * gx[2752];
                    gout21 += gx[128] * gx[1536] * gx[2880];
                    gout22 += gx[0] * gx[1600] * gx[2944];
                    gout23 += gx[256] * gx[1152] * gx[3136];
                    gout24 += gx[128] * gx[1344] * gx[3072];
                    gout25 += gx[0] * gx[1408] * gx[3136];
                    gout26 += gx[64] * gx[1152] * gx[3328];
                    break;
                    case 1:
                    gout0 += gx[1024] * gx[1216] * gx[2304];
                    gout1 += gx[960] * gx[1152] * gx[2432];
                    gout2 += gx[768] * gx[1472] * gx[2304];
                    gout3 += gx[832] * gx[1216] * gx[2496];
                    gout4 += gx[768] * gx[1152] * gx[2624];
                    gout5 += gx[576] * gx[1664] * gx[2304];
                    gout6 += gx[448] * gx[1792] * gx[2304];
                    gout7 += gx[384] * gx[1728] * gx[2432];
                    gout8 += gx[384] * gx[1664] * gx[2496];
                    gout9 += gx[640] * gx[1216] * gx[2688];
                    gout10 += gx[576] * gx[1152] * gx[2816];
                    gout11 += gx[384] * gx[1472] * gx[2688];
                    gout12 += gx[448] * gx[1216] * gx[2880];
                    gout13 += gx[384] * gx[1152] * gx[3008];
                    gout14 += gx[192] * gx[2048] * gx[2304];
                    gout15 += gx[64] * gx[2176] * gx[2304];
                    gout16 += gx[0] * gx[2112] * gx[2432];
                    gout17 += gx[0] * gx[2048] * gx[2496];
                    gout18 += gx[256] * gx[1600] * gx[2688];
                    gout19 += gx[192] * gx[1536] * gx[2816];
                    gout20 += gx[0] * gx[1856] * gx[2688];
                    gout21 += gx[64] * gx[1600] * gx[2880];
                    gout22 += gx[0] * gx[1536] * gx[3008];
                    gout23 += gx[192] * gx[1280] * gx[3072];
                    gout24 += gx[64] * gx[1408] * gx[3072];
                    gout25 += gx[0] * gx[1344] * gx[3200];
                    gout26 += gx[0] * gx[1280] * gx[3264];
                    break;
                    case 2:
                    gout0 += gx[1024] * gx[1152] * gx[2368];
                    gout1 += gx[896] * gx[1344] * gx[2304];
                    gout2 += gx[768] * gx[1408] * gx[2368];
                    gout3 += gx[832] * gx[1152] * gx[2560];
                    gout4 += gx[704] * gx[1536] * gx[2304];
                    gout5 += gx[576] * gx[1600] * gx[2368];
                    gout6 += gx[448] * gx[1728] * gx[2368];
                    gout7 += gx[512] * gx[1536] * gx[2496];
                    gout8 += gx[384] * gx[1600] * gx[2560];
                    gout9 += gx[640] * gx[1152] * gx[2752];
                    gout10 += gx[512] * gx[1344] * gx[2688];
                    gout11 += gx[384] * gx[1408] * gx[2752];
                    gout12 += gx[448] * gx[1152] * gx[2944];
                    gout13 += gx[320] * gx[1920] * gx[2304];
                    gout14 += gx[192] * gx[1984] * gx[2368];
                    gout15 += gx[64] * gx[2112] * gx[2368];
                    gout16 += gx[128] * gx[1920] * gx[2496];
                    gout17 += gx[0] * gx[1984] * gx[2560];
                    gout18 += gx[256] * gx[1536] * gx[2752];
                    gout19 += gx[128] * gx[1728] * gx[2688];
                    gout20 += gx[0] * gx[1792] * gx[2752];
                    gout21 += gx[64] * gx[1536] * gx[2944];
                    gout22 += gx[320] * gx[1152] * gx[3072];
                    gout23 += gx[192] * gx[1216] * gx[3136];
                    gout24 += gx[64] * gx[1344] * gx[3136];
                    gout25 += gx[128] * gx[1152] * gx[3264];
                    gout26 += gx[0] * gx[1216] * gx[3328];
                    break;
                    case 3:
                    gout0 += gx[960] * gx[1280] * gx[2304];
                    gout1 += gx[832] * gx[1408] * gx[2304];
                    gout2 += gx[768] * gx[1344] * gx[2432];
                    gout3 += gx[768] * gx[1280] * gx[2496];
                    gout4 += gx[640] * gx[1600] * gx[2304];
                    gout5 += gx[576] * gx[1536] * gx[2432];
                    gout6 += gx[384] * gx[1856] * gx[2304];
                    gout7 += gx[448] * gx[1600] * gx[2496];
                    gout8 += gx[384] * gx[1536] * gx[2624];
                    gout9 += gx[576] * gx[1280] * gx[2688];
                    gout10 += gx[448] * gx[1408] * gx[2688];
                    gout11 += gx[384] * gx[1344] * gx[2816];
                    gout12 += gx[384] * gx[1280] * gx[2880];
                    gout13 += gx[256] * gx[1984] * gx[2304];
                    gout14 += gx[192] * gx[1920] * gx[2432];
                    gout15 += gx[0] * gx[2240] * gx[2304];
                    gout16 += gx[64] * gx[1984] * gx[2496];
                    gout17 += gx[0] * gx[1920] * gx[2624];
                    gout18 += gx[192] * gx[1664] * gx[2688];
                    gout19 += gx[64] * gx[1792] * gx[2688];
                    gout20 += gx[0] * gx[1728] * gx[2816];
                    gout21 += gx[0] * gx[1664] * gx[2880];
                    gout22 += gx[256] * gx[1216] * gx[3072];
                    gout23 += gx[192] * gx[1152] * gx[3200];
                    gout24 += gx[0] * gx[1472] * gx[3072];
                    gout25 += gx[64] * gx[1216] * gx[3264];
                    gout26 += gx[0] * gx[1152] * gx[3392];
                    break;
                    }
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                switch (gout_id) {
                case 0:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+2)];
                val += gout18 * dm[(j0+0)*nao+(k0+4)];
                val += gout6 * dm[(j0+1)*nao+(k0+1)];
                val += gout15 * dm[(j0+1)*nao+(k0+3)];
                val += gout24 * dm[(j0+1)*nao+(k0+5)];
                val += gout3 * dm[(j0+2)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+2)];
                val += gout21 * dm[(j0+2)*nao+(k0+4)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+3)];
                val += gout23 * dm[(j0+0)*nao+(k0+5)];
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+2)];
                val += gout20 * dm[(j0+1)*nao+(k0+4)];
                val += gout8 * dm[(j0+2)*nao+(k0+1)];
                val += gout17 * dm[(j0+2)*nao+(k0+3)];
                val += gout26 * dm[(j0+2)*nao+(k0+5)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+2)];
                val += gout19 * dm[(j0+0)*nao+(k0+4)];
                val += gout7 * dm[(j0+1)*nao+(k0+1)];
                val += gout16 * dm[(j0+1)*nao+(k0+3)];
                val += gout25 * dm[(j0+1)*nao+(k0+5)];
                val += gout4 * dm[(j0+2)*nao+(k0+0)];
                val += gout13 * dm[(j0+2)*nao+(k0+2)];
                val += gout22 * dm[(j0+2)*nao+(k0+4)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout9 * dm[(i0+0)*nao+(k0+2)];
                val += gout18 * dm[(i0+0)*nao+(k0+4)];
                val += gout5 * dm[(i0+2)*nao+(k0+1)];
                val += gout14 * dm[(i0+2)*nao+(k0+3)];
                val += gout23 * dm[(i0+2)*nao+(k0+5)];
                val += gout1 * dm[(i0+4)*nao+(k0+0)];
                val += gout10 * dm[(i0+4)*nao+(k0+2)];
                val += gout19 * dm[(i0+4)*nao+(k0+4)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+1)];
                val += gout15 * dm[(i0+0)*nao+(k0+3)];
                val += gout24 * dm[(i0+0)*nao+(k0+5)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout11 * dm[(i0+2)*nao+(k0+2)];
                val += gout20 * dm[(i0+2)*nao+(k0+4)];
                val += gout7 * dm[(i0+4)*nao+(k0+1)];
                val += gout16 * dm[(i0+4)*nao+(k0+3)];
                val += gout25 * dm[(i0+4)*nao+(k0+5)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(k0+0)];
                val += gout12 * dm[(i0+0)*nao+(k0+2)];
                val += gout21 * dm[(i0+0)*nao+(k0+4)];
                val += gout8 * dm[(i0+2)*nao+(k0+1)];
                val += gout17 * dm[(i0+2)*nao+(k0+3)];
                val += gout26 * dm[(i0+2)*nao+(k0+5)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout13 * dm[(i0+4)*nao+(k0+2)];
                val += gout22 * dm[(i0+4)*nao+(k0+4)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+3), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+4), val);
                val = 0;
                val += gout24 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+5), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout8 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+3), val);
                val = 0;
                val += gout20 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+4), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(l0+0)];
                val += gout26 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+5), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout4 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout16 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+3), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+4), val);
                val = 0;
                val += gout25 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+0)];
                val += gout19 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout23 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
                val = 0;
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+3), val);
                val = 0;
                val += gout20 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+4), val);
                val = 0;
                val += gout24 * dm[(i0+0)*nao+(l0+0)];
                val += gout25 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+5), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                val = 0;
                val += gout17 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+3), val);
                val = 0;
                val += gout21 * dm[(i0+0)*nao+(l0+0)];
                val += gout22 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+4), val);
                val = 0;
                val += gout26 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+5), val);
                break;
                case 1:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+2)];
                val += gout18 * dm[(j0+0)*nao+(k0+4)];
                val += gout6 * dm[(j0+1)*nao+(k0+1)];
                val += gout15 * dm[(j0+1)*nao+(k0+3)];
                val += gout24 * dm[(j0+1)*nao+(k0+5)];
                val += gout3 * dm[(j0+2)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+2)];
                val += gout21 * dm[(j0+2)*nao+(k0+4)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+3)];
                val += gout23 * dm[(j0+0)*nao+(k0+5)];
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+2)];
                val += gout20 * dm[(j0+1)*nao+(k0+4)];
                val += gout8 * dm[(j0+2)*nao+(k0+1)];
                val += gout17 * dm[(j0+2)*nao+(k0+3)];
                val += gout26 * dm[(j0+2)*nao+(k0+5)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+2)];
                val += gout19 * dm[(j0+0)*nao+(k0+4)];
                val += gout7 * dm[(j0+1)*nao+(k0+1)];
                val += gout16 * dm[(j0+1)*nao+(k0+3)];
                val += gout25 * dm[(j0+1)*nao+(k0+5)];
                val += gout4 * dm[(j0+2)*nao+(k0+0)];
                val += gout13 * dm[(j0+2)*nao+(k0+2)];
                val += gout22 * dm[(j0+2)*nao+(k0+4)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(k0+0)];
                val += gout9 * dm[(i0+1)*nao+(k0+2)];
                val += gout18 * dm[(i0+1)*nao+(k0+4)];
                val += gout5 * dm[(i0+3)*nao+(k0+1)];
                val += gout14 * dm[(i0+3)*nao+(k0+3)];
                val += gout23 * dm[(i0+3)*nao+(k0+5)];
                val += gout1 * dm[(i0+5)*nao+(k0+0)];
                val += gout10 * dm[(i0+5)*nao+(k0+2)];
                val += gout19 * dm[(i0+5)*nao+(k0+4)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(k0+1)];
                val += gout15 * dm[(i0+1)*nao+(k0+3)];
                val += gout24 * dm[(i0+1)*nao+(k0+5)];
                val += gout2 * dm[(i0+3)*nao+(k0+0)];
                val += gout11 * dm[(i0+3)*nao+(k0+2)];
                val += gout20 * dm[(i0+3)*nao+(k0+4)];
                val += gout7 * dm[(i0+5)*nao+(k0+1)];
                val += gout16 * dm[(i0+5)*nao+(k0+3)];
                val += gout25 * dm[(i0+5)*nao+(k0+5)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(k0+0)];
                val += gout12 * dm[(i0+1)*nao+(k0+2)];
                val += gout21 * dm[(i0+1)*nao+(k0+4)];
                val += gout8 * dm[(i0+3)*nao+(k0+1)];
                val += gout17 * dm[(i0+3)*nao+(k0+3)];
                val += gout26 * dm[(i0+3)*nao+(k0+5)];
                val += gout4 * dm[(i0+5)*nao+(k0+0)];
                val += gout13 * dm[(i0+5)*nao+(k0+2)];
                val += gout22 * dm[(i0+5)*nao+(k0+4)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+3), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+4), val);
                val = 0;
                val += gout24 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+5), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout8 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+3), val);
                val = 0;
                val += gout20 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+4), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(l0+0)];
                val += gout26 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+5), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout4 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout16 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+3), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+4), val);
                val = 0;
                val += gout25 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(l0+0)];
                val += gout1 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout9 * dm[(i0+1)*nao+(l0+0)];
                val += gout10 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout18 * dm[(i0+1)*nao+(l0+0)];
                val += gout19 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout23 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
                val = 0;
                val += gout2 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(l0+0)];
                val += gout7 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(i0+1)*nao+(l0+0)];
                val += gout16 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+3), val);
                val = 0;
                val += gout20 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+4), val);
                val = 0;
                val += gout24 * dm[(i0+1)*nao+(l0+0)];
                val += gout25 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+5), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(l0+0)];
                val += gout4 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(l0+0)];
                val += gout13 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                val = 0;
                val += gout17 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+3), val);
                val = 0;
                val += gout21 * dm[(i0+1)*nao+(l0+0)];
                val += gout22 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+4), val);
                val = 0;
                val += gout26 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+5), val);
                break;
                case 2:
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+1)];
                val += gout13 * dm[(j0+0)*nao+(k0+3)];
                val += gout22 * dm[(j0+0)*nao+(k0+5)];
                val += gout1 * dm[(j0+1)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+2)];
                val += gout19 * dm[(j0+1)*nao+(k0+4)];
                val += gout7 * dm[(j0+2)*nao+(k0+1)];
                val += gout16 * dm[(j0+2)*nao+(k0+3)];
                val += gout25 * dm[(j0+2)*nao+(k0+5)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+2)];
                val += gout18 * dm[(j0+0)*nao+(k0+4)];
                val += gout6 * dm[(j0+1)*nao+(k0+1)];
                val += gout15 * dm[(j0+1)*nao+(k0+3)];
                val += gout24 * dm[(j0+1)*nao+(k0+5)];
                val += gout3 * dm[(j0+2)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+2)];
                val += gout21 * dm[(j0+2)*nao+(k0+4)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+3)];
                val += gout23 * dm[(j0+0)*nao+(k0+5)];
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+2)];
                val += gout20 * dm[(j0+1)*nao+(k0+4)];
                val += gout8 * dm[(j0+2)*nao+(k0+1)];
                val += gout17 * dm[(j0+2)*nao+(k0+3)];
                val += gout26 * dm[(j0+2)*nao+(k0+5)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(i0+0)*nao+(k0+1)];
                val += gout13 * dm[(i0+0)*nao+(k0+3)];
                val += gout22 * dm[(i0+0)*nao+(k0+5)];
                val += gout0 * dm[(i0+2)*nao+(k0+0)];
                val += gout9 * dm[(i0+2)*nao+(k0+2)];
                val += gout18 * dm[(i0+2)*nao+(k0+4)];
                val += gout5 * dm[(i0+4)*nao+(k0+1)];
                val += gout14 * dm[(i0+4)*nao+(k0+3)];
                val += gout23 * dm[(i0+4)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(i0+0)*nao+(k0+0)];
                val += gout10 * dm[(i0+0)*nao+(k0+2)];
                val += gout19 * dm[(i0+0)*nao+(k0+4)];
                val += gout6 * dm[(i0+2)*nao+(k0+1)];
                val += gout15 * dm[(i0+2)*nao+(k0+3)];
                val += gout24 * dm[(i0+2)*nao+(k0+5)];
                val += gout2 * dm[(i0+4)*nao+(k0+0)];
                val += gout11 * dm[(i0+4)*nao+(k0+2)];
                val += gout20 * dm[(i0+4)*nao+(k0+4)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(k0+1)];
                val += gout16 * dm[(i0+0)*nao+(k0+3)];
                val += gout25 * dm[(i0+0)*nao+(k0+5)];
                val += gout3 * dm[(i0+2)*nao+(k0+0)];
                val += gout12 * dm[(i0+2)*nao+(k0+2)];
                val += gout21 * dm[(i0+2)*nao+(k0+4)];
                val += gout8 * dm[(i0+4)*nao+(k0+1)];
                val += gout17 * dm[(i0+4)*nao+(k0+3)];
                val += gout26 * dm[(i0+4)*nao+(k0+5)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout7 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+3), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+4), val);
                val = 0;
                val += gout22 * dm[(j0+0)*nao+(l0+0)];
                val += gout25 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+3), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+4), val);
                val = 0;
                val += gout24 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+5), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout8 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+3), val);
                val = 0;
                val += gout20 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+4), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(l0+0)];
                val += gout26 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(i0+0)*nao+(l0+0)];
                val += gout5 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout9 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout13 * dm[(i0+0)*nao+(l0+0)];
                val += gout14 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout18 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout22 * dm[(i0+0)*nao+(l0+0)];
                val += gout23 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
                val = 0;
                val += gout1 * dm[(i0+0)*nao+(l0+0)];
                val += gout2 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+0)];
                val += gout11 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+3), val);
                val = 0;
                val += gout19 * dm[(i0+0)*nao+(l0+0)];
                val += gout20 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+4), val);
                val = 0;
                val += gout24 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+5), val);
                val = 0;
                val += gout3 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(l0+0)];
                val += gout8 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                val = 0;
                val += gout16 * dm[(i0+0)*nao+(l0+0)];
                val += gout17 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+3), val);
                val = 0;
                val += gout21 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+4), val);
                val = 0;
                val += gout25 * dm[(i0+0)*nao+(l0+0)];
                val += gout26 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+5), val);
                break;
                case 3:
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+1)];
                val += gout13 * dm[(j0+0)*nao+(k0+3)];
                val += gout22 * dm[(j0+0)*nao+(k0+5)];
                val += gout1 * dm[(j0+1)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+2)];
                val += gout19 * dm[(j0+1)*nao+(k0+4)];
                val += gout7 * dm[(j0+2)*nao+(k0+1)];
                val += gout16 * dm[(j0+2)*nao+(k0+3)];
                val += gout25 * dm[(j0+2)*nao+(k0+5)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+2)];
                val += gout18 * dm[(j0+0)*nao+(k0+4)];
                val += gout6 * dm[(j0+1)*nao+(k0+1)];
                val += gout15 * dm[(j0+1)*nao+(k0+3)];
                val += gout24 * dm[(j0+1)*nao+(k0+5)];
                val += gout3 * dm[(j0+2)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+2)];
                val += gout21 * dm[(j0+2)*nao+(k0+4)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+1)];
                val += gout14 * dm[(j0+0)*nao+(k0+3)];
                val += gout23 * dm[(j0+0)*nao+(k0+5)];
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+2)];
                val += gout20 * dm[(j0+1)*nao+(k0+4)];
                val += gout8 * dm[(j0+2)*nao+(k0+1)];
                val += gout17 * dm[(j0+2)*nao+(k0+3)];
                val += gout26 * dm[(j0+2)*nao+(k0+5)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(i0+1)*nao+(k0+1)];
                val += gout13 * dm[(i0+1)*nao+(k0+3)];
                val += gout22 * dm[(i0+1)*nao+(k0+5)];
                val += gout0 * dm[(i0+3)*nao+(k0+0)];
                val += gout9 * dm[(i0+3)*nao+(k0+2)];
                val += gout18 * dm[(i0+3)*nao+(k0+4)];
                val += gout5 * dm[(i0+5)*nao+(k0+1)];
                val += gout14 * dm[(i0+5)*nao+(k0+3)];
                val += gout23 * dm[(i0+5)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout10 * dm[(i0+1)*nao+(k0+2)];
                val += gout19 * dm[(i0+1)*nao+(k0+4)];
                val += gout6 * dm[(i0+3)*nao+(k0+1)];
                val += gout15 * dm[(i0+3)*nao+(k0+3)];
                val += gout24 * dm[(i0+3)*nao+(k0+5)];
                val += gout2 * dm[(i0+5)*nao+(k0+0)];
                val += gout11 * dm[(i0+5)*nao+(k0+2)];
                val += gout20 * dm[(i0+5)*nao+(k0+4)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(k0+1)];
                val += gout16 * dm[(i0+1)*nao+(k0+3)];
                val += gout25 * dm[(i0+1)*nao+(k0+5)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout12 * dm[(i0+3)*nao+(k0+2)];
                val += gout21 * dm[(i0+3)*nao+(k0+4)];
                val += gout8 * dm[(i0+5)*nao+(k0+1)];
                val += gout17 * dm[(i0+5)*nao+(k0+3)];
                val += gout26 * dm[(i0+5)*nao+(k0+5)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout7 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+3), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+4), val);
                val = 0;
                val += gout22 * dm[(j0+0)*nao+(l0+0)];
                val += gout25 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+3), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+4), val);
                val = 0;
                val += gout24 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+5), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout8 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+3), val);
                val = 0;
                val += gout20 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+4), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(l0+0)];
                val += gout26 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(i0+1)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout9 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout14 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout18 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout22 * dm[(i0+1)*nao+(l0+0)];
                val += gout23 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
                val = 0;
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+3), val);
                val = 0;
                val += gout19 * dm[(i0+1)*nao+(l0+0)];
                val += gout20 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+4), val);
                val = 0;
                val += gout24 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+5), val);
                val = 0;
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                val = 0;
                val += gout16 * dm[(i0+1)*nao+(l0+0)];
                val += gout17 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+3), val);
                val = 0;
                val += gout21 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+4), val);
                val = 0;
                val += gout25 * dm[(i0+1)*nao+(l0+0)];
                val += gout26 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+5), val);
                break;
                }
            }
        }
    }
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_2200(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;
        double gout30;
        double gout31;
        double gout32;
        double gout33;
        double gout34;
        double gout35;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        gout30 = 0;
        gout31 = 0;
        gout32 = 0;
        gout33 = 0;
        gout34 = 0;
        gout35 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
                    double trr_40x = c0x * trr_30x + 3*b10 * trr_20x;
                    double hrr_3100x = trr_40x - xjxi * trr_30x;
                    double hrr_2100x = trr_30x - xjxi * trr_20x;
                    double hrr_2200x = hrr_3100x - xjxi * hrr_2100x;
                    gout0 += hrr_2200x * fac * wt;
                    double hrr_1100x = trr_20x - xjxi * trr_10x;
                    double hrr_1200x = hrr_2100x - xjxi * hrr_1100x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_1200x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_1200x * fac * trr_10z;
                    double hrr_0100x = trr_10x - xjxi * 1;
                    double hrr_0200x = hrr_1100x - xjxi * hrr_0100x;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += hrr_0200x * trr_20y * wt;
                    gout4 += hrr_0200x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += hrr_0200x * fac * trr_20z;
                    double hrr_0100y = trr_10y - yjyi * fac;
                    gout6 += hrr_2100x * hrr_0100y * wt;
                    double hrr_1100y = trr_20y - yjyi * trr_10y;
                    gout7 += hrr_1100x * hrr_1100y * wt;
                    gout8 += hrr_1100x * hrr_0100y * trr_10z;
                    double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
                    double hrr_2100y = trr_30y - yjyi * trr_20y;
                    gout9 += hrr_0100x * hrr_2100y * wt;
                    gout10 += hrr_0100x * hrr_1100y * trr_10z;
                    gout11 += hrr_0100x * hrr_0100y * trr_20z;
                    double hrr_0100z = trr_10z - zjzi * wt;
                    gout12 += hrr_2100x * fac * hrr_0100z;
                    gout13 += hrr_1100x * trr_10y * hrr_0100z;
                    double hrr_1100z = trr_20z - zjzi * trr_10z;
                    gout14 += hrr_1100x * fac * hrr_1100z;
                    gout15 += hrr_0100x * trr_20y * hrr_0100z;
                    gout16 += hrr_0100x * trr_10y * hrr_1100z;
                    double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
                    double hrr_2100z = trr_30z - zjzi * trr_20z;
                    gout17 += hrr_0100x * fac * hrr_2100z;
                    double hrr_0200y = hrr_1100y - yjyi * hrr_0100y;
                    gout18 += trr_20x * hrr_0200y * wt;
                    double hrr_1200y = hrr_2100y - yjyi * hrr_1100y;
                    gout19 += trr_10x * hrr_1200y * wt;
                    gout20 += trr_10x * hrr_0200y * trr_10z;
                    double trr_40y = c0y * trr_30y + 3*b10 * trr_20y;
                    double hrr_3100y = trr_40y - yjyi * trr_30y;
                    double hrr_2200y = hrr_3100y - yjyi * hrr_2100y;
                    gout21 += 1 * hrr_2200y * wt;
                    gout22 += 1 * hrr_1200y * trr_10z;
                    gout23 += 1 * hrr_0200y * trr_20z;
                    gout24 += trr_20x * hrr_0100y * hrr_0100z;
                    gout25 += trr_10x * hrr_1100y * hrr_0100z;
                    gout26 += trr_10x * hrr_0100y * hrr_1100z;
                    gout27 += 1 * hrr_2100y * hrr_0100z;
                    gout28 += 1 * hrr_1100y * hrr_1100z;
                    gout29 += 1 * hrr_0100y * hrr_2100z;
                    double hrr_0200z = hrr_1100z - zjzi * hrr_0100z;
                    gout30 += trr_20x * fac * hrr_0200z;
                    gout31 += trr_10x * trr_10y * hrr_0200z;
                    double hrr_1200z = hrr_2100z - zjzi * hrr_1100z;
                    gout32 += trr_10x * fac * hrr_1200z;
                    gout33 += 1 * trr_20y * hrr_0200z;
                    gout34 += 1 * trr_10y * hrr_1200z;
                    double trr_40z = c0z * trr_30z + 3*b10 * trr_20z;
                    double hrr_3100z = trr_40z - zjzi * trr_30z;
                    double hrr_2200z = hrr_3100z - zjzi * hrr_2100z;
                    gout35 += 1 * fac * hrr_2200z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+1)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+0)];
                val += gout18 * dm[(j0+3)*nao+(k0+0)];
                val += gout24 * dm[(j0+4)*nao+(k0+0)];
                val += gout30 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout7 * dm[(j0+1)*nao+(k0+0)];
                val += gout13 * dm[(j0+2)*nao+(k0+0)];
                val += gout19 * dm[(j0+3)*nao+(k0+0)];
                val += gout25 * dm[(j0+4)*nao+(k0+0)];
                val += gout31 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout8 * dm[(j0+1)*nao+(k0+0)];
                val += gout14 * dm[(j0+2)*nao+(k0+0)];
                val += gout20 * dm[(j0+3)*nao+(k0+0)];
                val += gout26 * dm[(j0+4)*nao+(k0+0)];
                val += gout32 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+1)*nao+(k0+0)];
                val += gout15 * dm[(j0+2)*nao+(k0+0)];
                val += gout21 * dm[(j0+3)*nao+(k0+0)];
                val += gout27 * dm[(j0+4)*nao+(k0+0)];
                val += gout33 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+0)];
                val += gout16 * dm[(j0+2)*nao+(k0+0)];
                val += gout22 * dm[(j0+3)*nao+(k0+0)];
                val += gout28 * dm[(j0+4)*nao+(k0+0)];
                val += gout34 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+2)*nao+(k0+0)];
                val += gout23 * dm[(j0+3)*nao+(k0+0)];
                val += gout29 * dm[(j0+4)*nao+(k0+0)];
                val += gout35 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+0)];
                val += gout7 * dm[(i0+1)*nao+(k0+0)];
                val += gout8 * dm[(i0+2)*nao+(k0+0)];
                val += gout9 * dm[(i0+3)*nao+(k0+0)];
                val += gout10 * dm[(i0+4)*nao+(k0+0)];
                val += gout11 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(k0+0)];
                val += gout13 * dm[(i0+1)*nao+(k0+0)];
                val += gout14 * dm[(i0+2)*nao+(k0+0)];
                val += gout15 * dm[(i0+3)*nao+(k0+0)];
                val += gout16 * dm[(i0+4)*nao+(k0+0)];
                val += gout17 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(k0+0)];
                val += gout19 * dm[(i0+1)*nao+(k0+0)];
                val += gout20 * dm[(i0+2)*nao+(k0+0)];
                val += gout21 * dm[(i0+3)*nao+(k0+0)];
                val += gout22 * dm[(i0+4)*nao+(k0+0)];
                val += gout23 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+3)*nao+(l0+0), val);
                val = 0;
                val += gout24 * dm[(i0+0)*nao+(k0+0)];
                val += gout25 * dm[(i0+1)*nao+(k0+0)];
                val += gout26 * dm[(i0+2)*nao+(k0+0)];
                val += gout27 * dm[(i0+3)*nao+(k0+0)];
                val += gout28 * dm[(i0+4)*nao+(k0+0)];
                val += gout29 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+4)*nao+(l0+0), val);
                val = 0;
                val += gout30 * dm[(i0+0)*nao+(k0+0)];
                val += gout31 * dm[(i0+1)*nao+(k0+0)];
                val += gout32 * dm[(i0+2)*nao+(k0+0)];
                val += gout33 * dm[(i0+3)*nao+(k0+0)];
                val += gout34 * dm[(i0+4)*nao+(k0+0)];
                val += gout35 * dm[(i0+5)*nao+(k0+0)];
                atomicAdd(vk+(j0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout6 * dm[(j0+1)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                val += gout18 * dm[(j0+3)*nao+(l0+0)];
                val += gout24 * dm[(j0+4)*nao+(l0+0)];
                val += gout30 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout7 * dm[(j0+1)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                val += gout19 * dm[(j0+3)*nao+(l0+0)];
                val += gout25 * dm[(j0+4)*nao+(l0+0)];
                val += gout31 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout8 * dm[(j0+1)*nao+(l0+0)];
                val += gout14 * dm[(j0+2)*nao+(l0+0)];
                val += gout20 * dm[(j0+3)*nao+(l0+0)];
                val += gout26 * dm[(j0+4)*nao+(l0+0)];
                val += gout32 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout9 * dm[(j0+1)*nao+(l0+0)];
                val += gout15 * dm[(j0+2)*nao+(l0+0)];
                val += gout21 * dm[(j0+3)*nao+(l0+0)];
                val += gout27 * dm[(j0+4)*nao+(l0+0)];
                val += gout33 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                val += gout16 * dm[(j0+2)*nao+(l0+0)];
                val += gout22 * dm[(j0+3)*nao+(l0+0)];
                val += gout28 * dm[(j0+4)*nao+(l0+0)];
                val += gout34 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout17 * dm[(j0+2)*nao+(l0+0)];
                val += gout23 * dm[(j0+3)*nao+(l0+0)];
                val += gout29 * dm[(j0+4)*nao+(l0+0)];
                val += gout35 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout9 * dm[(i0+3)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                val += gout17 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+0)];
                val += gout19 * dm[(i0+1)*nao+(l0+0)];
                val += gout20 * dm[(i0+2)*nao+(l0+0)];
                val += gout21 * dm[(i0+3)*nao+(l0+0)];
                val += gout22 * dm[(i0+4)*nao+(l0+0)];
                val += gout23 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+0), val);
                val = 0;
                val += gout24 * dm[(i0+0)*nao+(l0+0)];
                val += gout25 * dm[(i0+1)*nao+(l0+0)];
                val += gout26 * dm[(i0+2)*nao+(l0+0)];
                val += gout27 * dm[(i0+3)*nao+(l0+0)];
                val += gout28 * dm[(i0+4)*nao+(l0+0)];
                val += gout29 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+0), val);
                val = 0;
                val += gout30 * dm[(i0+0)*nao+(l0+0)];
                val += gout31 * dm[(i0+1)*nao+(l0+0)];
                val += gout32 * dm[(i0+2)*nao+(l0+0)];
                val += gout33 * dm[(i0+3)*nao+(l0+0)];
                val += gout34 * dm[(i0+4)*nao+(l0+0)];
                val += gout35 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+0), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_2210(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int gout_id = threadIdx.y;
    int thread_id = 64 * gout_id + sq_id;
    int threads = 256;
    constexpr int nsq_per_block = 64;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (thread_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (thread_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    constexpr int g_size = 18;

    extern __shared__ double shared_memory[];
    double *rlrk = shared_memory + sq_id;
    double *Rpq = shared_memory + nsq_per_block * 3 + sq_id;
    double *akl_cache = shared_memory + nsq_per_block * 6 + sq_id;
    double *fac_ijkl = shared_memory + nsq_per_block * 8 + sq_id;
    double *gx = shared_memory + nsq_per_block * 9 + sq_id;
    double *rw = shared_memory + nsq_per_block * (g_size*3+9) + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * (g_size*3+bounds.nroots*2+9);

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double aij_cache[2];
    __shared__ double *expi;
    __shared__ double *expj;
    if (thread_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (thread_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[thread_id] = env[ri_ptr+thread_id];
        rjri[thread_id] = env[rj_ptr+thread_id] - ri[thread_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = thread_id; ij < iprim*jprim; ij += threads) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }

    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        if (gout_id == 0) {
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            rlrk[0] = xlxk;
            rlrk[64] = ylyk;
            rlrk[128] = zlzk;
            fac_ijkl[0] = fac_sym;
        }
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            __syncthreads();
            if (gout_id == 0) {
                double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
                double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
                double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
                double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double xlxk = rlrk[0];
                double ylyk = rlrk[64];
                double zlzk = rlrk[128];
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = ck[kp] * cl[lp] * Kcd;
                double fac_sym = fac_ijkl[0];
                gx[0] = fac_sym * ckcl;
                akl_cache[0] = akl;
                akl_cache[nsq_per_block] = al_akl;
            }
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double akl = akl_cache[0];
                double al_akl = akl_cache[nsq_per_block];
                double xij = ri[0] + (rjri[0]) * aj_aij;
                double yij = ri[1] + (rjri[1]) * aj_aij;
                double zij = ri[2] + (rjri[2]) * aj_aij;
                double xkl = rk[0] + rlrk[0*nsq_per_block] * al_akl;
                double ykl = rk[1] + rlrk[1*nsq_per_block] * al_akl;
                double zkl = rk[2] + rlrk[2*nsq_per_block] * al_akl;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                if (gout_id == 0) {
                    Rpq[0*nsq_per_block] = xpq;
                    Rpq[1*nsq_per_block] = ypq;
                    Rpq[2*nsq_per_block] = zpq;
                    double cicj = cicj_cache[ijp];
                    gx[nsq_per_block*g_size] = cicj / (aij*akl*sqrt(aij+akl));
                    if (sq_id == 0) {
                        aij_cache[0] = aij;
                        aij_cache[1] = aj_aij;
                    }
                }
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                for (int irys = 0; irys < nroots; ++irys) {
                    __syncthreads();
                    double s0, s1, s2;
                    double rt = rw[irys*128];
                    double aij = aij_cache[0];
                    double rt_aa = rt / (aij + akl);
                    double akl = akl_cache[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double rt_akl = rt_aa * aij;
                    double b00 = .5 * rt_aa;
                    for (int n = gout_id; n < 3; n += 4) {
                        if (n == 2) {
                            gx[2304] = rw[irys*128+64];
                        }
                        double *_gx = gx + n * 1152;
                        double xjxi = rjri[n];
                        double Rpa = xjxi * aij_cache[1];
                        double c0x = Rpa - rt_aij * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = c0x * s0;
                        _gx[64] = s1;
                        s2 = c0x * s1 + 1 * b10 * s0;
                        _gx[128] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = c0x * s1 + 2 * b10 * s0;
                        _gx[192] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = c0x * s1 + 3 * b10 * s0;
                        _gx[256] = s2;
                        double xlxk = rlrk[n*64];
                        double Rqc = xlxk * akl_cache[64];
                        double cpx = Rqc + rt_akl * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = cpx * s0;
                        _gx[576] = s1;
                        s0 = _gx[64];
                        s1 = cpx * s0;
                        s1 += 1 * b00 * _gx[0];
                        _gx[640] = s1;
                        s0 = _gx[128];
                        s1 = cpx * s0;
                        s1 += 2 * b00 * _gx[64];
                        _gx[704] = s1;
                        s0 = _gx[192];
                        s1 = cpx * s0;
                        s1 += 3 * b00 * _gx[128];
                        _gx[768] = s1;
                        s0 = _gx[256];
                        s1 = cpx * s0;
                        s1 += 4 * b00 * _gx[192];
                        _gx[832] = s1;
                        s1 = _gx[256];
                        s0 = _gx[192];
                        _gx[384] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[128];
                        _gx[320] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[64];
                        _gx[256] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[0];
                        _gx[192] = s1 - xjxi * s0;
                        s1 = _gx[384];
                        s0 = _gx[320];
                        _gx[512] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[256];
                        _gx[448] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[192];
                        _gx[384] = s1 - xjxi * s0;
                        s1 = _gx[832];
                        s0 = _gx[768];
                        _gx[960] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[704];
                        _gx[896] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[640];
                        _gx[832] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[576];
                        _gx[768] = s1 - xjxi * s0;
                        s1 = _gx[960];
                        s0 = _gx[896];
                        _gx[1088] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[832];
                        _gx[1024] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[768];
                        _gx[960] = s1 - xjxi * s0;
                    }
                    __syncthreads();
                    switch (gout_id) {
                    case 0:
                    gout0 += gx[1088] * gx[1152] * gx[2304];
                    gout1 += gx[960] * gx[1216] * gx[2368];
                    gout2 += gx[832] * gx[1344] * gx[2368];
                    gout3 += gx[896] * gx[1152] * gx[2496];
                    gout4 += gx[768] * gx[1216] * gx[2560];
                    gout5 += gx[640] * gx[1536] * gx[2368];
                    gout6 += gx[704] * gx[1344] * gx[2496];
                    gout7 += gx[576] * gx[1408] * gx[2560];
                    gout8 += gx[640] * gx[1152] * gx[2752];
                    gout9 += gx[512] * gx[1728] * gx[2304];
                    gout10 += gx[384] * gx[1792] * gx[2368];
                    gout11 += gx[256] * gx[1920] * gx[2368];
                    gout12 += gx[320] * gx[1728] * gx[2496];
                    gout13 += gx[192] * gx[1792] * gx[2560];
                    gout14 += gx[64] * gx[2112] * gx[2368];
                    gout15 += gx[128] * gx[1920] * gx[2496];
                    gout16 += gx[0] * gx[1984] * gx[2560];
                    gout17 += gx[64] * gx[1728] * gx[2752];
                    gout18 += gx[512] * gx[1152] * gx[2880];
                    gout19 += gx[384] * gx[1216] * gx[2944];
                    gout20 += gx[256] * gx[1344] * gx[2944];
                    gout21 += gx[320] * gx[1152] * gx[3072];
                    gout22 += gx[192] * gx[1216] * gx[3136];
                    gout23 += gx[64] * gx[1536] * gx[2944];
                    gout24 += gx[128] * gx[1344] * gx[3072];
                    gout25 += gx[0] * gx[1408] * gx[3136];
                    gout26 += gx[64] * gx[1152] * gx[3328];
                    break;
                    case 1:
                    gout0 += gx[1024] * gx[1216] * gx[2304];
                    gout1 += gx[960] * gx[1152] * gx[2432];
                    gout2 += gx[768] * gx[1472] * gx[2304];
                    gout3 += gx[832] * gx[1216] * gx[2496];
                    gout4 += gx[768] * gx[1152] * gx[2624];
                    gout5 += gx[576] * gx[1664] * gx[2304];
                    gout6 += gx[640] * gx[1408] * gx[2496];
                    gout7 += gx[576] * gx[1344] * gx[2624];
                    gout8 += gx[576] * gx[1280] * gx[2688];
                    gout9 += gx[448] * gx[1792] * gx[2304];
                    gout10 += gx[384] * gx[1728] * gx[2432];
                    gout11 += gx[192] * gx[2048] * gx[2304];
                    gout12 += gx[256] * gx[1792] * gx[2496];
                    gout13 += gx[192] * gx[1728] * gx[2624];
                    gout14 += gx[0] * gx[2240] * gx[2304];
                    gout15 += gx[64] * gx[1984] * gx[2496];
                    gout16 += gx[0] * gx[1920] * gx[2624];
                    gout17 += gx[0] * gx[1856] * gx[2688];
                    gout18 += gx[448] * gx[1216] * gx[2880];
                    gout19 += gx[384] * gx[1152] * gx[3008];
                    gout20 += gx[192] * gx[1472] * gx[2880];
                    gout21 += gx[256] * gx[1216] * gx[3072];
                    gout22 += gx[192] * gx[1152] * gx[3200];
                    gout23 += gx[0] * gx[1664] * gx[2880];
                    gout24 += gx[64] * gx[1408] * gx[3072];
                    gout25 += gx[0] * gx[1344] * gx[3200];
                    gout26 += gx[0] * gx[1280] * gx[3264];
                    break;
                    case 2:
                    gout0 += gx[1024] * gx[1152] * gx[2368];
                    gout1 += gx[896] * gx[1344] * gx[2304];
                    gout2 += gx[768] * gx[1408] * gx[2368];
                    gout3 += gx[832] * gx[1152] * gx[2560];
                    gout4 += gx[704] * gx[1536] * gx[2304];
                    gout5 += gx[576] * gx[1600] * gx[2368];
                    gout6 += gx[640] * gx[1344] * gx[2560];
                    gout7 += gx[704] * gx[1152] * gx[2688];
                    gout8 += gx[576] * gx[1216] * gx[2752];
                    gout9 += gx[448] * gx[1728] * gx[2368];
                    gout10 += gx[320] * gx[1920] * gx[2304];
                    gout11 += gx[192] * gx[1984] * gx[2368];
                    gout12 += gx[256] * gx[1728] * gx[2560];
                    gout13 += gx[128] * gx[2112] * gx[2304];
                    gout14 += gx[0] * gx[2176] * gx[2368];
                    gout15 += gx[64] * gx[1920] * gx[2560];
                    gout16 += gx[128] * gx[1728] * gx[2688];
                    gout17 += gx[0] * gx[1792] * gx[2752];
                    gout18 += gx[448] * gx[1152] * gx[2944];
                    gout19 += gx[320] * gx[1344] * gx[2880];
                    gout20 += gx[192] * gx[1408] * gx[2944];
                    gout21 += gx[256] * gx[1152] * gx[3136];
                    gout22 += gx[128] * gx[1536] * gx[2880];
                    gout23 += gx[0] * gx[1600] * gx[2944];
                    gout24 += gx[64] * gx[1344] * gx[3136];
                    gout25 += gx[128] * gx[1152] * gx[3264];
                    gout26 += gx[0] * gx[1216] * gx[3328];
                    break;
                    case 3:
                    gout0 += gx[960] * gx[1280] * gx[2304];
                    gout1 += gx[832] * gx[1408] * gx[2304];
                    gout2 += gx[768] * gx[1344] * gx[2432];
                    gout3 += gx[768] * gx[1280] * gx[2496];
                    gout4 += gx[640] * gx[1600] * gx[2304];
                    gout5 += gx[576] * gx[1536] * gx[2432];
                    gout6 += gx[576] * gx[1472] * gx[2496];
                    gout7 += gx[640] * gx[1216] * gx[2688];
                    gout8 += gx[576] * gx[1152] * gx[2816];
                    gout9 += gx[384] * gx[1856] * gx[2304];
                    gout10 += gx[256] * gx[1984] * gx[2304];
                    gout11 += gx[192] * gx[1920] * gx[2432];
                    gout12 += gx[192] * gx[1856] * gx[2496];
                    gout13 += gx[64] * gx[2176] * gx[2304];
                    gout14 += gx[0] * gx[2112] * gx[2432];
                    gout15 += gx[0] * gx[2048] * gx[2496];
                    gout16 += gx[64] * gx[1792] * gx[2688];
                    gout17 += gx[0] * gx[1728] * gx[2816];
                    gout18 += gx[384] * gx[1280] * gx[2880];
                    gout19 += gx[256] * gx[1408] * gx[2880];
                    gout20 += gx[192] * gx[1344] * gx[3008];
                    gout21 += gx[192] * gx[1280] * gx[3072];
                    gout22 += gx[64] * gx[1600] * gx[2880];
                    gout23 += gx[0] * gx[1536] * gx[3008];
                    gout24 += gx[0] * gx[1472] * gx[3072];
                    gout25 += gx[64] * gx[1216] * gx[3264];
                    gout26 += gx[0] * gx[1152] * gx[3392];
                    break;
                    }
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                switch (gout_id) {
                case 0:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout3 * dm[(j0+2)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+1)];
                val += gout21 * dm[(j0+2)*nao+(k0+2)];
                val += gout6 * dm[(j0+4)*nao+(k0+0)];
                val += gout15 * dm[(j0+4)*nao+(k0+1)];
                val += gout24 * dm[(j0+4)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+1)];
                val += gout20 * dm[(j0+1)*nao+(k0+2)];
                val += gout5 * dm[(j0+3)*nao+(k0+0)];
                val += gout14 * dm[(j0+3)*nao+(k0+1)];
                val += gout23 * dm[(j0+3)*nao+(k0+2)];
                val += gout8 * dm[(j0+5)*nao+(k0+0)];
                val += gout17 * dm[(j0+5)*nao+(k0+1)];
                val += gout26 * dm[(j0+5)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout19 * dm[(j0+0)*nao+(k0+2)];
                val += gout4 * dm[(j0+2)*nao+(k0+0)];
                val += gout13 * dm[(j0+2)*nao+(k0+1)];
                val += gout22 * dm[(j0+2)*nao+(k0+2)];
                val += gout7 * dm[(j0+4)*nao+(k0+0)];
                val += gout16 * dm[(j0+4)*nao+(k0+1)];
                val += gout25 * dm[(j0+4)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout9 * dm[(i0+0)*nao+(k0+1)];
                val += gout18 * dm[(i0+0)*nao+(k0+2)];
                val += gout1 * dm[(i0+4)*nao+(k0+0)];
                val += gout10 * dm[(i0+4)*nao+(k0+1)];
                val += gout19 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout11 * dm[(i0+2)*nao+(k0+1)];
                val += gout20 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(k0+0)];
                val += gout12 * dm[(i0+0)*nao+(k0+1)];
                val += gout21 * dm[(i0+0)*nao+(k0+2)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout13 * dm[(i0+4)*nao+(k0+1)];
                val += gout22 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(i0+2)*nao+(k0+0)];
                val += gout14 * dm[(i0+2)*nao+(k0+1)];
                val += gout23 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+3)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(k0+0)];
                val += gout15 * dm[(i0+0)*nao+(k0+1)];
                val += gout24 * dm[(i0+0)*nao+(k0+2)];
                val += gout7 * dm[(i0+4)*nao+(k0+0)];
                val += gout16 * dm[(i0+4)*nao+(k0+1)];
                val += gout25 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+4)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(i0+2)*nao+(k0+0)];
                val += gout17 * dm[(i0+2)*nao+(k0+1)];
                val += gout26 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                val += gout6 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                val += gout15 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                val += gout24 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                val += gout5 * dm[(j0+3)*nao+(l0+0)];
                val += gout8 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout14 * dm[(j0+3)*nao+(l0+0)];
                val += gout17 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(j0+1)*nao+(l0+0)];
                val += gout23 * dm[(j0+3)*nao+(l0+0)];
                val += gout26 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout4 * dm[(j0+2)*nao+(l0+0)];
                val += gout7 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                val += gout16 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+2)*nao+(l0+0)];
                val += gout25 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+0)*nao+(l0+0)];
                val += gout10 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+0)*nao+(l0+0)];
                val += gout19 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(i0+0)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout21 * dm[(i0+0)*nao+(l0+0)];
                val += gout22 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+0), val);
                val = 0;
                val += gout14 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+1), val);
                val = 0;
                val += gout23 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+0)*nao+(l0+0)];
                val += gout7 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+1), val);
                val = 0;
                val += gout24 * dm[(i0+0)*nao+(l0+0)];
                val += gout25 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+0), val);
                val = 0;
                val += gout17 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+1), val);
                val = 0;
                val += gout26 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+2), val);
                break;
                case 1:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout3 * dm[(j0+2)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+1)];
                val += gout21 * dm[(j0+2)*nao+(k0+2)];
                val += gout6 * dm[(j0+4)*nao+(k0+0)];
                val += gout15 * dm[(j0+4)*nao+(k0+1)];
                val += gout24 * dm[(j0+4)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+1)];
                val += gout20 * dm[(j0+1)*nao+(k0+2)];
                val += gout5 * dm[(j0+3)*nao+(k0+0)];
                val += gout14 * dm[(j0+3)*nao+(k0+1)];
                val += gout23 * dm[(j0+3)*nao+(k0+2)];
                val += gout8 * dm[(j0+5)*nao+(k0+0)];
                val += gout17 * dm[(j0+5)*nao+(k0+1)];
                val += gout26 * dm[(j0+5)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout19 * dm[(j0+0)*nao+(k0+2)];
                val += gout4 * dm[(j0+2)*nao+(k0+0)];
                val += gout13 * dm[(j0+2)*nao+(k0+1)];
                val += gout22 * dm[(j0+2)*nao+(k0+2)];
                val += gout7 * dm[(j0+4)*nao+(k0+0)];
                val += gout16 * dm[(j0+4)*nao+(k0+1)];
                val += gout25 * dm[(j0+4)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(k0+0)];
                val += gout9 * dm[(i0+1)*nao+(k0+1)];
                val += gout18 * dm[(i0+1)*nao+(k0+2)];
                val += gout1 * dm[(i0+5)*nao+(k0+0)];
                val += gout10 * dm[(i0+5)*nao+(k0+1)];
                val += gout19 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(i0+3)*nao+(k0+0)];
                val += gout11 * dm[(i0+3)*nao+(k0+1)];
                val += gout20 * dm[(i0+3)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(k0+0)];
                val += gout12 * dm[(i0+1)*nao+(k0+1)];
                val += gout21 * dm[(i0+1)*nao+(k0+2)];
                val += gout4 * dm[(i0+5)*nao+(k0+0)];
                val += gout13 * dm[(i0+5)*nao+(k0+1)];
                val += gout22 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(i0+3)*nao+(k0+0)];
                val += gout14 * dm[(i0+3)*nao+(k0+1)];
                val += gout23 * dm[(i0+3)*nao+(k0+2)];
                atomicAdd(vk+(j0+3)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(k0+0)];
                val += gout15 * dm[(i0+1)*nao+(k0+1)];
                val += gout24 * dm[(i0+1)*nao+(k0+2)];
                val += gout7 * dm[(i0+5)*nao+(k0+0)];
                val += gout16 * dm[(i0+5)*nao+(k0+1)];
                val += gout25 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+4)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(i0+3)*nao+(k0+0)];
                val += gout17 * dm[(i0+3)*nao+(k0+1)];
                val += gout26 * dm[(i0+3)*nao+(k0+2)];
                atomicAdd(vk+(j0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                val += gout6 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                val += gout15 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                val += gout24 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                val += gout5 * dm[(j0+3)*nao+(l0+0)];
                val += gout8 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout14 * dm[(j0+3)*nao+(l0+0)];
                val += gout17 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(j0+1)*nao+(l0+0)];
                val += gout23 * dm[(j0+3)*nao+(l0+0)];
                val += gout26 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout4 * dm[(j0+2)*nao+(l0+0)];
                val += gout7 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                val += gout16 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+2)*nao+(l0+0)];
                val += gout25 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(l0+0)];
                val += gout1 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+1)*nao+(l0+0)];
                val += gout10 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+1)*nao+(l0+0)];
                val += gout19 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(i0+1)*nao+(l0+0)];
                val += gout4 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(l0+0)];
                val += gout13 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout21 * dm[(i0+1)*nao+(l0+0)];
                val += gout22 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+0), val);
                val = 0;
                val += gout14 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+1), val);
                val = 0;
                val += gout23 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+1)*nao+(l0+0)];
                val += gout7 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(i0+1)*nao+(l0+0)];
                val += gout16 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+1), val);
                val = 0;
                val += gout24 * dm[(i0+1)*nao+(l0+0)];
                val += gout25 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+0), val);
                val = 0;
                val += gout17 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+1), val);
                val = 0;
                val += gout26 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+2), val);
                break;
                case 2:
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                val += gout4 * dm[(j0+3)*nao+(k0+0)];
                val += gout13 * dm[(j0+3)*nao+(k0+1)];
                val += gout22 * dm[(j0+3)*nao+(k0+2)];
                val += gout7 * dm[(j0+5)*nao+(k0+0)];
                val += gout16 * dm[(j0+5)*nao+(k0+1)];
                val += gout25 * dm[(j0+5)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout3 * dm[(j0+2)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+1)];
                val += gout21 * dm[(j0+2)*nao+(k0+2)];
                val += gout6 * dm[(j0+4)*nao+(k0+0)];
                val += gout15 * dm[(j0+4)*nao+(k0+1)];
                val += gout24 * dm[(j0+4)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+1)];
                val += gout20 * dm[(j0+1)*nao+(k0+2)];
                val += gout5 * dm[(j0+3)*nao+(k0+0)];
                val += gout14 * dm[(j0+3)*nao+(k0+1)];
                val += gout23 * dm[(j0+3)*nao+(k0+2)];
                val += gout8 * dm[(j0+5)*nao+(k0+0)];
                val += gout17 * dm[(j0+5)*nao+(k0+1)];
                val += gout26 * dm[(j0+5)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+2)*nao+(k0+0)];
                val += gout9 * dm[(i0+2)*nao+(k0+1)];
                val += gout18 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(i0+0)*nao+(k0+0)];
                val += gout10 * dm[(i0+0)*nao+(k0+1)];
                val += gout19 * dm[(i0+0)*nao+(k0+2)];
                val += gout2 * dm[(i0+4)*nao+(k0+0)];
                val += gout11 * dm[(i0+4)*nao+(k0+1)];
                val += gout20 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(i0+2)*nao+(k0+0)];
                val += gout12 * dm[(i0+2)*nao+(k0+1)];
                val += gout21 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(i0+0)*nao+(k0+0)];
                val += gout13 * dm[(i0+0)*nao+(k0+1)];
                val += gout22 * dm[(i0+0)*nao+(k0+2)];
                val += gout5 * dm[(i0+4)*nao+(k0+0)];
                val += gout14 * dm[(i0+4)*nao+(k0+1)];
                val += gout23 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+3)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+2)*nao+(k0+0)];
                val += gout15 * dm[(i0+2)*nao+(k0+1)];
                val += gout24 * dm[(i0+2)*nao+(k0+2)];
                atomicAdd(vk+(j0+4)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(k0+0)];
                val += gout16 * dm[(i0+0)*nao+(k0+1)];
                val += gout25 * dm[(i0+0)*nao+(k0+2)];
                val += gout8 * dm[(i0+4)*nao+(k0+0)];
                val += gout17 * dm[(i0+4)*nao+(k0+1)];
                val += gout26 * dm[(i0+4)*nao+(k0+2)];
                atomicAdd(vk+(j0+5)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(l0+0)];
                val += gout4 * dm[(j0+3)*nao+(l0+0)];
                val += gout7 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                val += gout13 * dm[(j0+3)*nao+(l0+0)];
                val += gout16 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                val += gout22 * dm[(j0+3)*nao+(l0+0)];
                val += gout25 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                val += gout6 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                val += gout15 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                val += gout24 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                val += gout5 * dm[(j0+3)*nao+(l0+0)];
                val += gout8 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout14 * dm[(j0+3)*nao+(l0+0)];
                val += gout17 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(j0+1)*nao+(l0+0)];
                val += gout23 * dm[(j0+3)*nao+(l0+0)];
                val += gout26 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(i0+0)*nao+(l0+0)];
                val += gout2 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+0)];
                val += gout11 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(i0+0)*nao+(l0+0)];
                val += gout20 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout21 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(i0+0)*nao+(l0+0)];
                val += gout5 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+0), val);
                val = 0;
                val += gout13 * dm[(i0+0)*nao+(l0+0)];
                val += gout14 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+1), val);
                val = 0;
                val += gout22 * dm[(i0+0)*nao+(l0+0)];
                val += gout23 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+1), val);
                val = 0;
                val += gout24 * dm[(i0+2)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(l0+0)];
                val += gout8 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(i0+0)*nao+(l0+0)];
                val += gout17 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+1), val);
                val = 0;
                val += gout25 * dm[(i0+0)*nao+(l0+0)];
                val += gout26 * dm[(i0+4)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+2), val);
                break;
                case 3:
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                val += gout4 * dm[(j0+3)*nao+(k0+0)];
                val += gout13 * dm[(j0+3)*nao+(k0+1)];
                val += gout22 * dm[(j0+3)*nao+(k0+2)];
                val += gout7 * dm[(j0+5)*nao+(k0+0)];
                val += gout16 * dm[(j0+5)*nao+(k0+1)];
                val += gout25 * dm[(j0+5)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout18 * dm[(j0+0)*nao+(k0+2)];
                val += gout3 * dm[(j0+2)*nao+(k0+0)];
                val += gout12 * dm[(j0+2)*nao+(k0+1)];
                val += gout21 * dm[(j0+2)*nao+(k0+2)];
                val += gout6 * dm[(j0+4)*nao+(k0+0)];
                val += gout15 * dm[(j0+4)*nao+(k0+1)];
                val += gout24 * dm[(j0+4)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+1)];
                val += gout20 * dm[(j0+1)*nao+(k0+2)];
                val += gout5 * dm[(j0+3)*nao+(k0+0)];
                val += gout14 * dm[(j0+3)*nao+(k0+1)];
                val += gout23 * dm[(j0+3)*nao+(k0+2)];
                val += gout8 * dm[(j0+5)*nao+(k0+0)];
                val += gout17 * dm[(j0+5)*nao+(k0+1)];
                val += gout26 * dm[(j0+5)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+3)*nao+(k0+0)];
                val += gout9 * dm[(i0+3)*nao+(k0+1)];
                val += gout18 * dm[(i0+3)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout10 * dm[(i0+1)*nao+(k0+1)];
                val += gout19 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+5)*nao+(k0+0)];
                val += gout11 * dm[(i0+5)*nao+(k0+1)];
                val += gout20 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout12 * dm[(i0+3)*nao+(k0+1)];
                val += gout21 * dm[(i0+3)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(i0+1)*nao+(k0+0)];
                val += gout13 * dm[(i0+1)*nao+(k0+1)];
                val += gout22 * dm[(i0+1)*nao+(k0+2)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout14 * dm[(i0+5)*nao+(k0+1)];
                val += gout23 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+3)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(i0+3)*nao+(k0+0)];
                val += gout15 * dm[(i0+3)*nao+(k0+1)];
                val += gout24 * dm[(i0+3)*nao+(k0+2)];
                atomicAdd(vk+(j0+4)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(k0+0)];
                val += gout16 * dm[(i0+1)*nao+(k0+1)];
                val += gout25 * dm[(i0+1)*nao+(k0+2)];
                val += gout8 * dm[(i0+5)*nao+(k0+0)];
                val += gout17 * dm[(i0+5)*nao+(k0+1)];
                val += gout26 * dm[(i0+5)*nao+(k0+2)];
                atomicAdd(vk+(j0+5)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+1)*nao+(l0+0)];
                val += gout4 * dm[(j0+3)*nao+(l0+0)];
                val += gout7 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                val += gout13 * dm[(j0+3)*nao+(l0+0)];
                val += gout16 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                val += gout22 * dm[(j0+3)*nao+(l0+0)];
                val += gout25 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout3 * dm[(j0+2)*nao+(l0+0)];
                val += gout6 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                val += gout15 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                val += gout24 * dm[(j0+4)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                val += gout5 * dm[(j0+3)*nao+(l0+0)];
                val += gout8 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout14 * dm[(j0+3)*nao+(l0+0)];
                val += gout17 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(j0+1)*nao+(l0+0)];
                val += gout23 * dm[(j0+3)*nao+(l0+0)];
                val += gout26 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(i0+1)*nao+(l0+0)];
                val += gout20 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout21 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(i0+1)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+0), val);
                val = 0;
                val += gout13 * dm[(i0+1)*nao+(l0+0)];
                val += gout14 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+1), val);
                val = 0;
                val += gout22 * dm[(i0+1)*nao+(l0+0)];
                val += gout23 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+1), val);
                val = 0;
                val += gout24 * dm[(i0+3)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(i0+1)*nao+(l0+0)];
                val += gout17 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+1), val);
                val = 0;
                val += gout25 * dm[(i0+1)*nao+(l0+0)];
                val += gout26 * dm[(i0+5)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+2), val);
                break;
                }
            }
        }
    }
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

/*
 * rys_k_3000: unrolled contraction of two-electron integrals with kmat.dm,
 * accumulated into kmat.vk, for shell quartets in which the i shell spans 10
 * basis functions and the j, k and l shells span one function each (see the
 * index ranges used in the scatter phase at the bottom of the kernel).
 *
 * Work distribution
 *   - Each thread block repeatedly claims one ij shell-pair index from the
 *     global counter *head (atomicAdd) until bounds.npairs_ij is exhausted.
 *   - For the claimed pair, _fill_vk_tasks / _fill_sr_vk_tasks are expected to
 *     fill this block's slice of `pool` (QUEUE_DEPTH ints per block) with kl
 *     shell-pair indices and to set `ntasks`.
 *   - Each thread then handles one kl pair per pass of the task loop.
 *
 * Launch requirements (as used by the code below)
 *   - Only threadIdx.x / blockDim.x are used, i.e. a 1-D thread block.
 *   - Dynamic shared memory: 2*bounds.nroots*blockDim.x doubles for the Rys
 *     roots/weights (rw), followed by bounds.iprim*bounds.jprim doubles for
 *     cicj_cache.
 *   - With CUDA >= 12.4 the kernel is compiled with __maxnreg__(128).
 */
#if CUDA_VERSION >= 12040
__global__ __maxnreg__(128) static
#else
__global__ static
#endif
void rys_k_3000(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    // Per-block queue of kl shell-pair indices.
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    // Thread 0 claims the first ij pair for the whole block.
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    // lr_factor != 0 selects the general task builder, otherwise the
    // short-range-only builder is used.
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    // Nothing to do for this ij pair: claim the next one. ntasks is shared,
    // so the branch is uniform across the block and the barrier is safe.
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    // rw is interleaved across threads: element n of this thread lives at
    // rw[n*nsq_per_block].
    double *rw = shared_memory + sq_id;
    // Block-wide cache of contracted ij primitive prefactors.
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        // Lanes 1 and 2 must not read the shared ish/jsh here: they were just
        // written by lane 0 and no barrier separates the write from this
        // read. Every thread holds bas_ij, so recompute the indices locally.
        int ish_local = bas_ij / nbas;
        int jsh_local = bas_ij % nbas;
        int ri_ptr = bas[ish_local*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh_local*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    // Publishes ish, jsh, expi, expj, ri and rjri to the whole block.
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    // Cooperative fill of cicj_cache[ij] = ci*cj*exp(-ai*aj/(ai+aj)*|rj-ri|^2).
    // The barrier at the top of the task loop makes it visible to all threads.
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    // task_id - sq_id is the same for every thread, so the bound
    // `task_id < ntasks+sq_id` gives all threads an identical trip count and
    // the barriers inside the loop are reached by the whole block. Threads
    // whose task_id >= ntasks act as padding lanes.
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        // NOTE(review): padding lanes read bas_kl_idx[task_id] past ntasks;
        // presumably the queue holds usable shell-pair indices there --
        // TODO confirm against _fill_vk_tasks/_fill_sr_vk_tasks.
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            // Halve the weight for each coincidence i==j, k==l, (ij)==(kl).
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            // Padding lane: zero weight.
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        // One accumulator per i-function of the quartet (10 in total).
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;

        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        // Loop over kl primitive pairs, then over ij primitive pairs.
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                // Called by every thread, padding lanes included, before the
                // early `continue` below (presumably because the helper
                // synchronizes the block internally -- verify in
                // rys_roots_for_k.cu).
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    // Roots sit at even slots of rw, weights at odd slots.
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    // x recurrence (unscaled); the y chain carries `fac` and
                    // the z chain carries `wt`, so each product below
                    // contains fac*wt exactly once.
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
                    gout0 += trr_30x * fac * wt;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += trr_20x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += trr_20x * fac * trr_10z;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += trr_10x * trr_20y * wt;
                    gout4 += trr_10x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += trr_10x * fac * trr_20z;
                    double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
                    gout6 += 1 * trr_30y * wt;
                    gout7 += 1 * trr_20y * trr_10z;
                    gout8 += 1 * trr_10y * trr_20z;
                    double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
                    gout9 += 1 * fac * trr_30z;
                }
            }
        }
        // Scatter phase: only real tasks write; atomics are used because
        // several threads may update the same vk element.
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                // vk[i,l] += g * dm[j,k]
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                // vk[j,l] += sum_i g * dm[i,k]
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout6 * dm[(i0+6)*nao+(k0+0)];
                val += gout7 * dm[(i0+7)*nao+(k0+0)];
                val += gout8 * dm[(i0+8)*nao+(k0+0)];
                val += gout9 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                // vk[i,k] += g * dm[j,l]
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                // vk[j,k] += sum_i g * dm[i,l]
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                val += gout6 * dm[(i0+6)*nao+(l0+0)];
                val += gout7 * dm[(i0+7)*nao+(l0+0)];
                val += gout8 * dm[(i0+8)*nao+(l0+0)];
                val += gout9 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
            }
        }
    }
    // Claim the next ij pair; the barrier publishes it to the block and also
    // separates this iteration's shared-memory reads from the next
    // iteration's writes.
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

/*
 * rys_k_3010: unrolled contraction of two-electron integrals with kmat.dm,
 * accumulated into kmat.vk, for shell quartets in which the i shell spans 10
 * basis functions, the k shell spans 3, and the j and l shells span one
 * function each (see the index ranges used in the scatter phase). The 30
 * accumulators are laid out as gout[i + 10*k].
 *
 * Work distribution
 *   - Each thread block repeatedly claims one ij shell-pair index from the
 *     global counter *head (atomicAdd) until bounds.npairs_ij is exhausted.
 *   - For the claimed pair, _fill_vk_tasks / _fill_sr_vk_tasks are expected to
 *     fill this block's slice of `pool` (QUEUE_DEPTH ints per block) with kl
 *     shell-pair indices and to set `ntasks`.
 *   - Each thread then handles one kl pair per pass of the task loop.
 *
 * Launch requirements (as used by the code below)
 *   - Only threadIdx.x / blockDim.x are used, i.e. a 1-D thread block.
 *   - Dynamic shared memory: 2*bounds.nroots*blockDim.x doubles for the Rys
 *     roots/weights (rw), followed by bounds.iprim*bounds.jprim doubles for
 *     cicj_cache.
 */
__global__ static
void rys_k_3010(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    // Per-block queue of kl shell-pair indices.
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    // Thread 0 claims the first ij pair for the whole block.
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    // lr_factor != 0 selects the general task builder, otherwise the
    // short-range-only builder is used.
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    // Nothing to do for this ij pair: claim the next one. ntasks is shared,
    // so the branch is uniform across the block and the barrier is safe.
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    // rw is interleaved across threads: element n of this thread lives at
    // rw[n*nsq_per_block].
    double *rw = shared_memory + sq_id;
    // Block-wide cache of contracted ij primitive prefactors.
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        // Lanes 1 and 2 must not read the shared ish/jsh here: they were just
        // written by lane 0 and no barrier separates the write from this
        // read. Every thread holds bas_ij, so recompute the indices locally.
        int ish_local = bas_ij / nbas;
        int jsh_local = bas_ij % nbas;
        int ri_ptr = bas[ish_local*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh_local*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    // Publishes ish, jsh, expi, expj, ri and rjri to the whole block.
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    // Cooperative fill of cicj_cache[ij] = ci*cj*exp(-ai*aj/(ai+aj)*|rj-ri|^2).
    // The barrier at the top of the task loop makes it visible to all threads.
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    // task_id - sq_id is the same for every thread, so the bound
    // `task_id < ntasks+sq_id` gives all threads an identical trip count and
    // the barriers inside the loop are reached by the whole block. Threads
    // whose task_id >= ntasks act as padding lanes.
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        // NOTE(review): padding lanes read bas_kl_idx[task_id] past ntasks;
        // presumably the queue holds usable shell-pair indices there --
        // TODO confirm against _fill_vk_tasks/_fill_sr_vk_tasks.
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            // Halve the weight for each coincidence i==j, k==l, (ij)==(kl).
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            // Padding lane: zero weight.
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        // Accumulators: gout[i + 10*k], i in 0..9, k in 0..2.
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;

        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        // Loop over kl primitive pairs, then over ij primitive pairs.
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                // Called by every thread, padding lanes included, before the
                // early `continue` below (presumably because the helper
                // synchronizes the block internally -- verify in
                // rys_roots_for_k.cu).
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    // Roots sit at even slots of rw, weights at odd slots.
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    // x recurrence (unscaled); the y chain carries `fac` and
                    // the z chain carries `wt`, so each product below
                    // contains fac*wt exactly once.
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
                    // k index 0 (gout0..gout9): k-side order raised in x.
                    double trr_31x = cpx * trr_30x + 3*b00 * trr_20x;
                    gout0 += trr_31x * fac * wt;
                    double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += trr_21x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += trr_21x * fac * trr_10z;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += trr_11x * trr_20y * wt;
                    gout4 += trr_11x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += trr_11x * fac * trr_20z;
                    double trr_01x = cpx * 1;
                    double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
                    gout6 += trr_01x * trr_30y * wt;
                    gout7 += trr_01x * trr_20y * trr_10z;
                    gout8 += trr_01x * trr_10y * trr_20z;
                    double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
                    gout9 += trr_01x * fac * trr_30z;
                    // k index 1 (gout10..gout19): k-side order raised in y.
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout10 += trr_30x * trr_01y * wt;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout11 += trr_20x * trr_11y * wt;
                    gout12 += trr_20x * trr_01y * trr_10z;
                    double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                    gout13 += trr_10x * trr_21y * wt;
                    gout14 += trr_10x * trr_11y * trr_10z;
                    gout15 += trr_10x * trr_01y * trr_20z;
                    double trr_31y = cpy * trr_30y + 3*b00 * trr_20y;
                    gout16 += 1 * trr_31y * wt;
                    gout17 += 1 * trr_21y * trr_10z;
                    gout18 += 1 * trr_11y * trr_20z;
                    gout19 += 1 * trr_01y * trr_30z;
                    // k index 2 (gout20..gout29): k-side order raised in z.
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout20 += trr_30x * fac * trr_01z;
                    gout21 += trr_20x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout22 += trr_20x * fac * trr_11z;
                    gout23 += trr_10x * trr_20y * trr_01z;
                    gout24 += trr_10x * trr_10y * trr_11z;
                    double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                    gout25 += trr_10x * fac * trr_21z;
                    gout26 += 1 * trr_30y * trr_01z;
                    gout27 += 1 * trr_20y * trr_11z;
                    gout28 += 1 * trr_10y * trr_21z;
                    double trr_31z = cpz * trr_30z + 3*b00 * trr_20z;
                    gout29 += 1 * fac * trr_31z;
                }
            }
        }
        // Scatter phase: only real tasks write; atomics are used because
        // several threads may update the same vk element.
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                // vk[i,l] += sum_k g * dm[j,k]
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout20 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+1)];
                val += gout22 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+1)];
                val += gout23 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+0)*nao+(k0+1)];
                val += gout24 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout15 * dm[(j0+0)*nao+(k0+1)];
                val += gout25 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+0)*nao+(k0+1)];
                val += gout26 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout17 * dm[(j0+0)*nao+(k0+1)];
                val += gout27 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                val += gout18 * dm[(j0+0)*nao+(k0+1)];
                val += gout28 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                val += gout29 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                // vk[j,l] += sum_{i,k} g * dm[i,k]
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout10 * dm[(i0+0)*nao+(k0+1)];
                val += gout20 * dm[(i0+0)*nao+(k0+2)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout11 * dm[(i0+1)*nao+(k0+1)];
                val += gout21 * dm[(i0+1)*nao+(k0+2)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout12 * dm[(i0+2)*nao+(k0+1)];
                val += gout22 * dm[(i0+2)*nao+(k0+2)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout13 * dm[(i0+3)*nao+(k0+1)];
                val += gout23 * dm[(i0+3)*nao+(k0+2)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout14 * dm[(i0+4)*nao+(k0+1)];
                val += gout24 * dm[(i0+4)*nao+(k0+2)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout15 * dm[(i0+5)*nao+(k0+1)];
                val += gout25 * dm[(i0+5)*nao+(k0+2)];
                val += gout6 * dm[(i0+6)*nao+(k0+0)];
                val += gout16 * dm[(i0+6)*nao+(k0+1)];
                val += gout26 * dm[(i0+6)*nao+(k0+2)];
                val += gout7 * dm[(i0+7)*nao+(k0+0)];
                val += gout17 * dm[(i0+7)*nao+(k0+1)];
                val += gout27 * dm[(i0+7)*nao+(k0+2)];
                val += gout8 * dm[(i0+8)*nao+(k0+0)];
                val += gout18 * dm[(i0+8)*nao+(k0+1)];
                val += gout28 * dm[(i0+8)*nao+(k0+2)];
                val += gout9 * dm[(i0+9)*nao+(k0+0)];
                val += gout19 * dm[(i0+9)*nao+(k0+1)];
                val += gout29 * dm[(i0+9)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                // vk[i,k] += g * dm[j,l]
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout21 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout22 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout24 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout25 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+1), val);
                val = 0;
                val += gout26 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+1), val);
                val = 0;
                val += gout27 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+1), val);
                val = 0;
                val += gout28 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+2), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+1), val);
                val = 0;
                val += gout29 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+2), val);
                // vk[j,k] += sum_i g * dm[i,l]
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                val += gout6 * dm[(i0+6)*nao+(l0+0)];
                val += gout7 * dm[(i0+7)*nao+(l0+0)];
                val += gout8 * dm[(i0+8)*nao+(l0+0)];
                val += gout9 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+0)];
                val += gout11 * dm[(i0+1)*nao+(l0+0)];
                val += gout12 * dm[(i0+2)*nao+(l0+0)];
                val += gout13 * dm[(i0+3)*nao+(l0+0)];
                val += gout14 * dm[(i0+4)*nao+(l0+0)];
                val += gout15 * dm[(i0+5)*nao+(l0+0)];
                val += gout16 * dm[(i0+6)*nao+(l0+0)];
                val += gout17 * dm[(i0+7)*nao+(l0+0)];
                val += gout18 * dm[(i0+8)*nao+(l0+0)];
                val += gout19 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(i0+0)*nao+(l0+0)];
                val += gout21 * dm[(i0+1)*nao+(l0+0)];
                val += gout22 * dm[(i0+2)*nao+(l0+0)];
                val += gout23 * dm[(i0+3)*nao+(l0+0)];
                val += gout24 * dm[(i0+4)*nao+(l0+0)];
                val += gout25 * dm[(i0+5)*nao+(l0+0)];
                val += gout26 * dm[(i0+6)*nao+(l0+0)];
                val += gout27 * dm[(i0+7)*nao+(l0+0)];
                val += gout28 * dm[(i0+8)*nao+(l0+0)];
                val += gout29 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
            }
        }
    }
    // Claim the next ij pair; the barrier publishes it to the block and also
    // separates this iteration's shared-memory reads from the next
    // iteration's writes.
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_3011(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int gout_id = threadIdx.y;
    int thread_id = 64 * gout_id + sq_id;
    int threads = 256;
    constexpr int nsq_per_block = 64;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (thread_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (thread_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    constexpr int g_size = 16;

    extern __shared__ double shared_memory[];
    double *rlrk = shared_memory + sq_id;
    double *Rpq = shared_memory + nsq_per_block * 3 + sq_id;
    double *akl_cache = shared_memory + nsq_per_block * 6 + sq_id;
    double *fac_ijkl = shared_memory + nsq_per_block * 8 + sq_id;
    double *gx = shared_memory + nsq_per_block * 9 + sq_id;
    double *rw = shared_memory + nsq_per_block * (g_size*3+9) + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * (g_size*3+bounds.nroots*2+9);

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double aij_cache[2];
    __shared__ double *expi;
    __shared__ double *expj;
    if (thread_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (thread_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[thread_id] = env[ri_ptr+thread_id];
        rjri[thread_id] = env[rj_ptr+thread_id] - ri[thread_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = thread_id; ij < iprim*jprim; ij += threads) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }

    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        if (gout_id == 0) {
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            rlrk[0] = xlxk;
            rlrk[64] = ylyk;
            rlrk[128] = zlzk;
            fac_ijkl[0] = fac_sym;
        }
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            __syncthreads();
            if (gout_id == 0) {
                double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
                double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
                double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
                double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double xlxk = rlrk[0];
                double ylyk = rlrk[64];
                double zlzk = rlrk[128];
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = ck[kp] * cl[lp] * Kcd;
                double fac_sym = fac_ijkl[0];
                gx[0] = fac_sym * ckcl;
                akl_cache[0] = akl;
                akl_cache[nsq_per_block] = al_akl;
            }
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double akl = akl_cache[0];
                double al_akl = akl_cache[nsq_per_block];
                double xij = ri[0] + (rjri[0]) * aj_aij;
                double yij = ri[1] + (rjri[1]) * aj_aij;
                double zij = ri[2] + (rjri[2]) * aj_aij;
                double xkl = rk[0] + rlrk[0*nsq_per_block] * al_akl;
                double ykl = rk[1] + rlrk[1*nsq_per_block] * al_akl;
                double zkl = rk[2] + rlrk[2*nsq_per_block] * al_akl;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                if (gout_id == 0) {
                    Rpq[0*nsq_per_block] = xpq;
                    Rpq[1*nsq_per_block] = ypq;
                    Rpq[2*nsq_per_block] = zpq;
                    double cicj = cicj_cache[ijp];
                    gx[nsq_per_block*g_size] = cicj / (aij*akl*sqrt(aij+akl));
                    if (sq_id == 0) {
                        aij_cache[0] = aij;
                        aij_cache[1] = aj_aij;
                    }
                }
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                for (int irys = 0; irys < nroots; ++irys) {
                    __syncthreads();
                    double s0, s1, s2;
                    double rt = rw[irys*128];
                    double aij = aij_cache[0];
                    double rt_aa = rt / (aij + akl);
                    double akl = akl_cache[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double rt_akl = rt_aa * aij;
                    double b00 = .5 * rt_aa;
                    double b01 = .5/akl * (1 - rt_akl);
                    for (int n = gout_id; n < 3; n += 4) {
                        if (n == 2) {
                            gx[2048] = rw[irys*128+64];
                        }
                        double *_gx = gx + n * 1024;
                        double xjxi = rjri[n];
                        double Rpa = xjxi * aij_cache[1];
                        double c0x = Rpa - rt_aij * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = c0x * s0;
                        _gx[64] = s1;
                        s2 = c0x * s1 + 1 * b10 * s0;
                        _gx[128] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = c0x * s1 + 2 * b10 * s0;
                        _gx[192] = s2;
                        double xlxk = rlrk[n*64];
                        double Rqc = xlxk * akl_cache[64];
                        double cpx = Rqc + rt_akl * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = cpx * s0;
                        _gx[256] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        _gx[512] = s2;
                        s0 = _gx[64];
                        s1 = cpx * s0;
                        s1 += 1 * b00 * _gx[0];
                        _gx[320] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 1 * b00 * _gx[256];
                        _gx[576] = s2;
                        s0 = _gx[128];
                        s1 = cpx * s0;
                        s1 += 2 * b00 * _gx[64];
                        _gx[384] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 2 * b00 * _gx[320];
                        _gx[640] = s2;
                        s0 = _gx[192];
                        s1 = cpx * s0;
                        s1 += 3 * b00 * _gx[128];
                        _gx[448] = s1;
                        s2 = cpx*s1 + 1 * b01 *s0;
                        s2 += 3 * b00 * _gx[384];
                        _gx[704] = s2;
                        s1 = _gx[512];
                        s0 = _gx[256];
                        _gx[768] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[0];
                        _gx[512] = s1 - xlxk * s0;
                        s1 = _gx[576];
                        s0 = _gx[320];
                        _gx[832] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[64];
                        _gx[576] = s1 - xlxk * s0;
                        s1 = _gx[640];
                        s0 = _gx[384];
                        _gx[896] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[128];
                        _gx[640] = s1 - xlxk * s0;
                        s1 = _gx[704];
                        s0 = _gx[448];
                        _gx[960] = s1 - xlxk * s0;
                        s1 = s0;
                        s0 = _gx[192];
                        _gx[704] = s1 - xlxk * s0;
                    }
                    __syncthreads();
                    switch (gout_id) {
                    case 0:
                    gout0 += gx[960] * gx[1024] * gx[2048];
                    gout1 += gx[832] * gx[1088] * gx[2112];
                    gout2 += gx[768] * gx[1088] * gx[2176];
                    gout3 += gx[640] * gx[1280] * gx[2112];
                    gout4 += gx[512] * gx[1472] * gx[2048];
                    gout5 += gx[704] * gx[1024] * gx[2304];
                    gout6 += gx[576] * gx[1088] * gx[2368];
                    gout7 += gx[512] * gx[1088] * gx[2432];
                    gout8 += gx[384] * gx[1536] * gx[2112];
                    gout9 += gx[256] * gx[1728] * gx[2048];
                    gout10 += gx[192] * gx[1792] * gx[2048];
                    gout11 += gx[64] * gx[1856] * gx[2112];
                    gout12 += gx[0] * gx[1856] * gx[2176];
                    gout13 += gx[128] * gx[1536] * gx[2368];
                    gout14 += gx[0] * gx[1728] * gx[2304];
                    gout15 += gx[448] * gx[1024] * gx[2560];
                    gout16 += gx[320] * gx[1088] * gx[2624];
                    gout17 += gx[256] * gx[1088] * gx[2688];
                    gout18 += gx[128] * gx[1280] * gx[2624];
                    gout19 += gx[0] * gx[1472] * gx[2560];
                    gout20 += gx[192] * gx[1024] * gx[2816];
                    gout21 += gx[64] * gx[1088] * gx[2880];
                    gout22 += gx[0] * gx[1088] * gx[2944];
                    break;
                    case 1:
                    gout0 += gx[896] * gx[1088] * gx[2048];
                    gout1 += gx[832] * gx[1024] * gx[2176];
                    gout2 += gx[768] * gx[1024] * gx[2240];
                    gout3 += gx[576] * gx[1408] * gx[2048];
                    gout4 += gx[512] * gx[1408] * gx[2112];
                    gout5 += gx[640] * gx[1088] * gx[2304];
                    gout6 += gx[576] * gx[1024] * gx[2432];
                    gout7 += gx[512] * gx[1024] * gx[2496];
                    gout8 += gx[320] * gx[1664] * gx[2048];
                    gout9 += gx[256] * gx[1664] * gx[2112];
                    gout10 += gx[128] * gx[1856] * gx[2048];
                    gout11 += gx[64] * gx[1792] * gx[2176];
                    gout12 += gx[0] * gx[1792] * gx[2240];
                    gout13 += gx[64] * gx[1664] * gx[2304];
                    gout14 += gx[0] * gx[1664] * gx[2368];
                    gout15 += gx[384] * gx[1088] * gx[2560];
                    gout16 += gx[320] * gx[1024] * gx[2688];
                    gout17 += gx[256] * gx[1024] * gx[2752];
                    gout18 += gx[64] * gx[1408] * gx[2560];
                    gout19 += gx[0] * gx[1408] * gx[2624];
                    gout20 += gx[128] * gx[1088] * gx[2816];
                    gout21 += gx[64] * gx[1024] * gx[2944];
                    gout22 += gx[0] * gx[1024] * gx[3008];
                    break;
                    case 2:
                    gout0 += gx[896] * gx[1024] * gx[2112];
                    gout1 += gx[768] * gx[1216] * gx[2048];
                    gout2 += gx[704] * gx[1280] * gx[2048];
                    gout3 += gx[576] * gx[1344] * gx[2112];
                    gout4 += gx[512] * gx[1344] * gx[2176];
                    gout5 += gx[640] * gx[1024] * gx[2368];
                    gout6 += gx[512] * gx[1216] * gx[2304];
                    gout7 += gx[448] * gx[1536] * gx[2048];
                    gout8 += gx[320] * gx[1600] * gx[2112];
                    gout9 += gx[256] * gx[1600] * gx[2176];
                    gout10 += gx[128] * gx[1792] * gx[2112];
                    gout11 += gx[0] * gx[1984] * gx[2048];
                    gout12 += gx[192] * gx[1536] * gx[2304];
                    gout13 += gx[64] * gx[1600] * gx[2368];
                    gout14 += gx[0] * gx[1600] * gx[2432];
                    gout15 += gx[384] * gx[1024] * gx[2624];
                    gout16 += gx[256] * gx[1216] * gx[2560];
                    gout17 += gx[192] * gx[1280] * gx[2560];
                    gout18 += gx[64] * gx[1344] * gx[2624];
                    gout19 += gx[0] * gx[1344] * gx[2688];
                    gout20 += gx[128] * gx[1024] * gx[2880];
                    gout21 += gx[0] * gx[1216] * gx[2816];
                    break;
                    case 3:
                    gout0 += gx[832] * gx[1152] * gx[2048];
                    gout1 += gx[768] * gx[1152] * gx[2112];
                    gout2 += gx[640] * gx[1344] * gx[2048];
                    gout3 += gx[576] * gx[1280] * gx[2176];
                    gout4 += gx[512] * gx[1280] * gx[2240];
                    gout5 += gx[576] * gx[1152] * gx[2304];
                    gout6 += gx[512] * gx[1152] * gx[2368];
                    gout7 += gx[384] * gx[1600] * gx[2048];
                    gout8 += gx[320] * gx[1536] * gx[2176];
                    gout9 += gx[256] * gx[1536] * gx[2240];
                    gout10 += gx[64] * gx[1920] * gx[2048];
                    gout11 += gx[0] * gx[1920] * gx[2112];
                    gout12 += gx[128] * gx[1600] * gx[2304];
                    gout13 += gx[64] * gx[1536] * gx[2432];
                    gout14 += gx[0] * gx[1536] * gx[2496];
                    gout15 += gx[320] * gx[1152] * gx[2560];
                    gout16 += gx[256] * gx[1152] * gx[2624];
                    gout17 += gx[128] * gx[1344] * gx[2560];
                    gout18 += gx[64] * gx[1280] * gx[2688];
                    gout19 += gx[0] * gx[1280] * gx[2752];
                    gout20 += gx[64] * gx[1152] * gx[2816];
                    gout21 += gx[0] * gx[1152] * gx[2880];
                    break;
                    }
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                switch (gout_id) {
                case 0:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout5 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(k0+0)];
                val += gout20 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+6)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+6)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout7 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+8)*nao+(l0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(k0+0)];
                val += gout22 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+8)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout5 * dm[(i0+0)*nao+(k0+2)];
                val += gout3 * dm[(i0+2)*nao+(k0+1)];
                val += gout1 * dm[(i0+4)*nao+(k0+0)];
                val += gout6 * dm[(i0+4)*nao+(k0+2)];
                val += gout4 * dm[(i0+6)*nao+(k0+1)];
                val += gout2 * dm[(i0+8)*nao+(k0+0)];
                val += gout7 * dm[(i0+8)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(k0+1)];
                val += gout8 * dm[(i0+2)*nao+(k0+0)];
                val += gout13 * dm[(i0+2)*nao+(k0+2)];
                val += gout11 * dm[(i0+4)*nao+(k0+1)];
                val += gout9 * dm[(i0+6)*nao+(k0+0)];
                val += gout14 * dm[(i0+6)*nao+(k0+2)];
                val += gout12 * dm[(i0+8)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(k0+0)];
                val += gout20 * dm[(i0+0)*nao+(k0+2)];
                val += gout18 * dm[(i0+2)*nao+(k0+1)];
                val += gout16 * dm[(i0+4)*nao+(k0+0)];
                val += gout21 * dm[(i0+4)*nao+(k0+2)];
                val += gout19 * dm[(i0+6)*nao+(k0+1)];
                val += gout17 * dm[(i0+8)*nao+(k0+0)];
                val += gout22 * dm[(i0+8)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+6)*nao+(k0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+6)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+8)*nao+(k0+1), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+8)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout15 * dm[(i0+0)*nao+(l0+2)];
                val += gout8 * dm[(i0+2)*nao+(l0+1)];
                val += gout1 * dm[(i0+4)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+2)];
                val += gout9 * dm[(i0+6)*nao+(l0+1)];
                val += gout2 * dm[(i0+8)*nao+(l0+0)];
                val += gout17 * dm[(i0+8)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+1)];
                val += gout3 * dm[(i0+2)*nao+(l0+0)];
                val += gout18 * dm[(i0+2)*nao+(l0+2)];
                val += gout11 * dm[(i0+4)*nao+(l0+1)];
                val += gout4 * dm[(i0+6)*nao+(l0+0)];
                val += gout19 * dm[(i0+6)*nao+(l0+2)];
                val += gout12 * dm[(i0+8)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(i0+0)*nao+(l0+0)];
                val += gout20 * dm[(i0+0)*nao+(l0+2)];
                val += gout13 * dm[(i0+2)*nao+(l0+1)];
                val += gout6 * dm[(i0+4)*nao+(l0+0)];
                val += gout21 * dm[(i0+4)*nao+(l0+2)];
                val += gout14 * dm[(i0+6)*nao+(l0+1)];
                val += gout7 * dm[(i0+8)*nao+(l0+0)];
                val += gout22 * dm[(i0+8)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                break;
                case 1:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout5 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(k0+0)];
                val += gout20 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+7)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+7)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout7 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+9)*nao+(l0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(k0+0)];
                val += gout22 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+9)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(k0+0)];
                val += gout5 * dm[(i0+1)*nao+(k0+2)];
                val += gout3 * dm[(i0+3)*nao+(k0+1)];
                val += gout1 * dm[(i0+5)*nao+(k0+0)];
                val += gout6 * dm[(i0+5)*nao+(k0+2)];
                val += gout4 * dm[(i0+7)*nao+(k0+1)];
                val += gout2 * dm[(i0+9)*nao+(k0+0)];
                val += gout7 * dm[(i0+9)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(k0+1)];
                val += gout8 * dm[(i0+3)*nao+(k0+0)];
                val += gout13 * dm[(i0+3)*nao+(k0+2)];
                val += gout11 * dm[(i0+5)*nao+(k0+1)];
                val += gout9 * dm[(i0+7)*nao+(k0+0)];
                val += gout14 * dm[(i0+7)*nao+(k0+2)];
                val += gout12 * dm[(i0+9)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(i0+1)*nao+(k0+0)];
                val += gout20 * dm[(i0+1)*nao+(k0+2)];
                val += gout18 * dm[(i0+3)*nao+(k0+1)];
                val += gout16 * dm[(i0+5)*nao+(k0+0)];
                val += gout21 * dm[(i0+5)*nao+(k0+2)];
                val += gout19 * dm[(i0+7)*nao+(k0+1)];
                val += gout17 * dm[(i0+9)*nao+(k0+0)];
                val += gout22 * dm[(i0+9)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+7)*nao+(k0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+7)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+9)*nao+(k0+1), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+9)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(l0+0)];
                val += gout15 * dm[(i0+1)*nao+(l0+2)];
                val += gout8 * dm[(i0+3)*nao+(l0+1)];
                val += gout1 * dm[(i0+5)*nao+(l0+0)];
                val += gout16 * dm[(i0+5)*nao+(l0+2)];
                val += gout9 * dm[(i0+7)*nao+(l0+1)];
                val += gout2 * dm[(i0+9)*nao+(l0+0)];
                val += gout17 * dm[(i0+9)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(l0+1)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout18 * dm[(i0+3)*nao+(l0+2)];
                val += gout11 * dm[(i0+5)*nao+(l0+1)];
                val += gout4 * dm[(i0+7)*nao+(l0+0)];
                val += gout19 * dm[(i0+7)*nao+(l0+2)];
                val += gout12 * dm[(i0+9)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(i0+1)*nao+(l0+0)];
                val += gout20 * dm[(i0+1)*nao+(l0+2)];
                val += gout13 * dm[(i0+3)*nao+(l0+1)];
                val += gout6 * dm[(i0+5)*nao+(l0+0)];
                val += gout21 * dm[(i0+5)*nao+(l0+2)];
                val += gout14 * dm[(i0+7)*nao+(l0+1)];
                val += gout7 * dm[(i0+9)*nao+(l0+0)];
                val += gout22 * dm[(i0+9)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                break;
                case 2:
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+0)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout5 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+2)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(k0+0)];
                val += gout20 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+4)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+6)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+6)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+8)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+8)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(i0+0)*nao+(k0+1)];
                val += gout0 * dm[(i0+2)*nao+(k0+0)];
                val += gout5 * dm[(i0+2)*nao+(k0+2)];
                val += gout3 * dm[(i0+4)*nao+(k0+1)];
                val += gout1 * dm[(i0+6)*nao+(k0+0)];
                val += gout6 * dm[(i0+6)*nao+(k0+2)];
                val += gout4 * dm[(i0+8)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(k0+0)];
                val += gout12 * dm[(i0+0)*nao+(k0+2)];
                val += gout10 * dm[(i0+2)*nao+(k0+1)];
                val += gout8 * dm[(i0+4)*nao+(k0+0)];
                val += gout13 * dm[(i0+4)*nao+(k0+2)];
                val += gout11 * dm[(i0+6)*nao+(k0+1)];
                val += gout9 * dm[(i0+8)*nao+(k0+0)];
                val += gout14 * dm[(i0+8)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout17 * dm[(i0+0)*nao+(k0+1)];
                val += gout15 * dm[(i0+2)*nao+(k0+0)];
                val += gout20 * dm[(i0+2)*nao+(k0+2)];
                val += gout18 * dm[(i0+4)*nao+(k0+1)];
                val += gout16 * dm[(i0+6)*nao+(k0+0)];
                val += gout21 * dm[(i0+6)*nao+(k0+2)];
                val += gout19 * dm[(i0+8)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+6)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+6)*nao+(k0+2), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+8)*nao+(k0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+8)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(l0+1)];
                val += gout0 * dm[(i0+2)*nao+(l0+0)];
                val += gout15 * dm[(i0+2)*nao+(l0+2)];
                val += gout8 * dm[(i0+4)*nao+(l0+1)];
                val += gout1 * dm[(i0+6)*nao+(l0+0)];
                val += gout16 * dm[(i0+6)*nao+(l0+2)];
                val += gout9 * dm[(i0+8)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(i0+0)*nao+(l0+0)];
                val += gout17 * dm[(i0+0)*nao+(l0+2)];
                val += gout10 * dm[(i0+2)*nao+(l0+1)];
                val += gout3 * dm[(i0+4)*nao+(l0+0)];
                val += gout18 * dm[(i0+4)*nao+(l0+2)];
                val += gout11 * dm[(i0+6)*nao+(l0+1)];
                val += gout4 * dm[(i0+8)*nao+(l0+0)];
                val += gout19 * dm[(i0+8)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+1)];
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                val += gout20 * dm[(i0+2)*nao+(l0+2)];
                val += gout13 * dm[(i0+4)*nao+(l0+1)];
                val += gout6 * dm[(i0+6)*nao+(l0+0)];
                val += gout21 * dm[(i0+6)*nao+(l0+2)];
                val += gout14 * dm[(i0+8)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                break;
                case 3:
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+1)*nao+(l0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout5 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+3)*nao+(l0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(k0+0)];
                val += gout20 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+2), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+1), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+5)*nao+(l0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout6 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+7)*nao+(l0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(k0+0)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+7)*nao+(l0+2), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+0)*nao+(k0+2)];
                atomicAdd(vk+(i0+9)*nao+(l0+1), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                atomicAdd(vk+(i0+9)*nao+(l0+2), val);
                val = 0;
                val += gout2 * dm[(i0+1)*nao+(k0+1)];
                val += gout0 * dm[(i0+3)*nao+(k0+0)];
                val += gout5 * dm[(i0+3)*nao+(k0+2)];
                val += gout3 * dm[(i0+5)*nao+(k0+1)];
                val += gout1 * dm[(i0+7)*nao+(k0+0)];
                val += gout6 * dm[(i0+7)*nao+(k0+2)];
                val += gout4 * dm[(i0+9)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(k0+0)];
                val += gout12 * dm[(i0+1)*nao+(k0+2)];
                val += gout10 * dm[(i0+3)*nao+(k0+1)];
                val += gout8 * dm[(i0+5)*nao+(k0+0)];
                val += gout13 * dm[(i0+5)*nao+(k0+2)];
                val += gout11 * dm[(i0+7)*nao+(k0+1)];
                val += gout9 * dm[(i0+9)*nao+(k0+0)];
                val += gout14 * dm[(i0+9)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+1), val);
                val = 0;
                val += gout17 * dm[(i0+1)*nao+(k0+1)];
                val += gout15 * dm[(i0+3)*nao+(k0+0)];
                val += gout20 * dm[(i0+3)*nao+(k0+2)];
                val += gout18 * dm[(i0+5)*nao+(k0+1)];
                val += gout16 * dm[(i0+7)*nao+(k0+0)];
                val += gout21 * dm[(i0+7)*nao+(k0+2)];
                val += gout19 * dm[(i0+9)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+2), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout18 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+7)*nao+(k0+1), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+7)*nao+(k0+2), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+0)*nao+(l0+2)];
                atomicAdd(vk+(i0+9)*nao+(k0+1), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+1)];
                atomicAdd(vk+(i0+9)*nao+(k0+2), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(l0+1)];
                val += gout0 * dm[(i0+3)*nao+(l0+0)];
                val += gout15 * dm[(i0+3)*nao+(l0+2)];
                val += gout8 * dm[(i0+5)*nao+(l0+1)];
                val += gout1 * dm[(i0+7)*nao+(l0+0)];
                val += gout16 * dm[(i0+7)*nao+(l0+2)];
                val += gout9 * dm[(i0+9)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(i0+1)*nao+(l0+0)];
                val += gout17 * dm[(i0+1)*nao+(l0+2)];
                val += gout10 * dm[(i0+3)*nao+(l0+1)];
                val += gout3 * dm[(i0+5)*nao+(l0+0)];
                val += gout18 * dm[(i0+5)*nao+(l0+2)];
                val += gout11 * dm[(i0+7)*nao+(l0+1)];
                val += gout4 * dm[(i0+9)*nao+(l0+0)];
                val += gout19 * dm[(i0+9)*nao+(l0+2)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(l0+1)];
                val += gout5 * dm[(i0+3)*nao+(l0+0)];
                val += gout20 * dm[(i0+3)*nao+(l0+2)];
                val += gout13 * dm[(i0+5)*nao+(l0+1)];
                val += gout6 * dm[(i0+7)*nao+(l0+0)];
                val += gout21 * dm[(i0+7)*nao+(l0+2)];
                val += gout14 * dm[(i0+9)*nao+(l0+1)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                break;
                }
            }
        }
    }
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_3020(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;
        double gout30;
        double gout31;
        double gout32;
        double gout33;
        double gout34;
        double gout35;
        double gout36;
        double gout37;
        double gout38;
        double gout39;
        double gout40;
        double gout41;
        double gout42;
        double gout43;
        double gout44;
        double gout45;
        double gout46;
        double gout47;
        double gout48;
        double gout49;
        double gout50;
        double gout51;
        double gout52;
        double gout53;
        double gout54;
        double gout55;
        double gout56;
        double gout57;
        double gout58;
        double gout59;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        gout30 = 0;
        gout31 = 0;
        gout32 = 0;
        gout33 = 0;
        gout34 = 0;
        gout35 = 0;
        gout36 = 0;
        gout37 = 0;
        gout38 = 0;
        gout39 = 0;
        gout40 = 0;
        gout41 = 0;
        gout42 = 0;
        gout43 = 0;
        gout44 = 0;
        gout45 = 0;
        gout46 = 0;
        gout47 = 0;
        gout48 = 0;
        gout49 = 0;
        gout50 = 0;
        gout51 = 0;
        gout52 = 0;
        gout53 = 0;
        gout54 = 0;
        gout55 = 0;
        gout56 = 0;
        gout57 = 0;
        gout58 = 0;
        gout59 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double b00 = .5 * rt_aa;
                    double rt_akl = rt_aa * aij;
                    double b01 = .5/akl * (1 - rt_akl);
                    double cpx = xlxk*al_akl + xpq*rt_akl;
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
                    double trr_31x = cpx * trr_30x + 3*b00 * trr_20x;
                    double trr_21x = cpx * trr_20x + 2*b00 * trr_10x;
                    double trr_32x = cpx * trr_31x + 1*b01 * trr_30x + 3*b00 * trr_21x;
                    gout0 += trr_32x * fac * wt;
                    double trr_11x = cpx * trr_10x + 1*b00 * 1;
                    double trr_22x = cpx * trr_21x + 1*b01 * trr_20x + 2*b00 * trr_11x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += trr_22x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += trr_22x * fac * trr_10z;
                    double trr_01x = cpx * 1;
                    double trr_12x = cpx * trr_11x + 1*b01 * trr_10x + 1*b00 * trr_01x;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += trr_12x * trr_20y * wt;
                    gout4 += trr_12x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += trr_12x * fac * trr_20z;
                    double trr_02x = cpx * trr_01x + 1*b01 * 1;
                    double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
                    gout6 += trr_02x * trr_30y * wt;
                    gout7 += trr_02x * trr_20y * trr_10z;
                    gout8 += trr_02x * trr_10y * trr_20z;
                    double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
                    gout9 += trr_02x * fac * trr_30z;
                    double cpy = ylyk*al_akl + ypq*rt_akl;
                    double trr_01y = cpy * fac;
                    gout10 += trr_31x * trr_01y * wt;
                    double trr_11y = cpy * trr_10y + 1*b00 * fac;
                    gout11 += trr_21x * trr_11y * wt;
                    gout12 += trr_21x * trr_01y * trr_10z;
                    double trr_21y = cpy * trr_20y + 2*b00 * trr_10y;
                    gout13 += trr_11x * trr_21y * wt;
                    gout14 += trr_11x * trr_11y * trr_10z;
                    gout15 += trr_11x * trr_01y * trr_20z;
                    double trr_31y = cpy * trr_30y + 3*b00 * trr_20y;
                    gout16 += trr_01x * trr_31y * wt;
                    gout17 += trr_01x * trr_21y * trr_10z;
                    gout18 += trr_01x * trr_11y * trr_20z;
                    gout19 += trr_01x * trr_01y * trr_30z;
                    double cpz = zlzk*al_akl + zpq*rt_akl;
                    double trr_01z = cpz * wt;
                    gout20 += trr_31x * fac * trr_01z;
                    gout21 += trr_21x * trr_10y * trr_01z;
                    double trr_11z = cpz * trr_10z + 1*b00 * wt;
                    gout22 += trr_21x * fac * trr_11z;
                    gout23 += trr_11x * trr_20y * trr_01z;
                    gout24 += trr_11x * trr_10y * trr_11z;
                    double trr_21z = cpz * trr_20z + 2*b00 * trr_10z;
                    gout25 += trr_11x * fac * trr_21z;
                    gout26 += trr_01x * trr_30y * trr_01z;
                    gout27 += trr_01x * trr_20y * trr_11z;
                    gout28 += trr_01x * trr_10y * trr_21z;
                    double trr_31z = cpz * trr_30z + 3*b00 * trr_20z;
                    gout29 += trr_01x * fac * trr_31z;
                    double trr_02y = cpy * trr_01y + 1*b01 * fac;
                    gout30 += trr_30x * trr_02y * wt;
                    double trr_12y = cpy * trr_11y + 1*b01 * trr_10y + 1*b00 * trr_01y;
                    gout31 += trr_20x * trr_12y * wt;
                    gout32 += trr_20x * trr_02y * trr_10z;
                    double trr_22y = cpy * trr_21y + 1*b01 * trr_20y + 2*b00 * trr_11y;
                    gout33 += trr_10x * trr_22y * wt;
                    gout34 += trr_10x * trr_12y * trr_10z;
                    gout35 += trr_10x * trr_02y * trr_20z;
                    double trr_32y = cpy * trr_31y + 1*b01 * trr_30y + 3*b00 * trr_21y;
                    gout36 += 1 * trr_32y * wt;
                    gout37 += 1 * trr_22y * trr_10z;
                    gout38 += 1 * trr_12y * trr_20z;
                    gout39 += 1 * trr_02y * trr_30z;
                    gout40 += trr_30x * trr_01y * trr_01z;
                    gout41 += trr_20x * trr_11y * trr_01z;
                    gout42 += trr_20x * trr_01y * trr_11z;
                    gout43 += trr_10x * trr_21y * trr_01z;
                    gout44 += trr_10x * trr_11y * trr_11z;
                    gout45 += trr_10x * trr_01y * trr_21z;
                    gout46 += 1 * trr_31y * trr_01z;
                    gout47 += 1 * trr_21y * trr_11z;
                    gout48 += 1 * trr_11y * trr_21z;
                    gout49 += 1 * trr_01y * trr_31z;
                    double trr_02z = cpz * trr_01z + 1*b01 * wt;
                    gout50 += trr_30x * fac * trr_02z;
                    gout51 += trr_20x * trr_10y * trr_02z;
                    double trr_12z = cpz * trr_11z + 1*b01 * trr_10z + 1*b00 * trr_01z;
                    gout52 += trr_20x * fac * trr_12z;
                    gout53 += trr_10x * trr_20y * trr_02z;
                    gout54 += trr_10x * trr_10y * trr_12z;
                    double trr_22z = cpz * trr_21z + 1*b01 * trr_20z + 2*b00 * trr_11z;
                    gout55 += trr_10x * fac * trr_22z;
                    gout56 += 1 * trr_30y * trr_02z;
                    gout57 += 1 * trr_20y * trr_12z;
                    gout58 += 1 * trr_10y * trr_22z;
                    double trr_32z = cpz * trr_31z + 1*b01 * trr_30z + 3*b00 * trr_21z;
                    gout59 += 1 * fac * trr_32z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+0)*nao+(k0+1)];
                val += gout20 * dm[(j0+0)*nao+(k0+2)];
                val += gout30 * dm[(j0+0)*nao+(k0+3)];
                val += gout40 * dm[(j0+0)*nao+(k0+4)];
                val += gout50 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+0)*nao+(k0+1)];
                val += gout21 * dm[(j0+0)*nao+(k0+2)];
                val += gout31 * dm[(j0+0)*nao+(k0+3)];
                val += gout41 * dm[(j0+0)*nao+(k0+4)];
                val += gout51 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+0)*nao+(k0+1)];
                val += gout22 * dm[(j0+0)*nao+(k0+2)];
                val += gout32 * dm[(j0+0)*nao+(k0+3)];
                val += gout42 * dm[(j0+0)*nao+(k0+4)];
                val += gout52 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+0)*nao+(k0+1)];
                val += gout23 * dm[(j0+0)*nao+(k0+2)];
                val += gout33 * dm[(j0+0)*nao+(k0+3)];
                val += gout43 * dm[(j0+0)*nao+(k0+4)];
                val += gout53 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+0)*nao+(k0+1)];
                val += gout24 * dm[(j0+0)*nao+(k0+2)];
                val += gout34 * dm[(j0+0)*nao+(k0+3)];
                val += gout44 * dm[(j0+0)*nao+(k0+4)];
                val += gout54 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout15 * dm[(j0+0)*nao+(k0+1)];
                val += gout25 * dm[(j0+0)*nao+(k0+2)];
                val += gout35 * dm[(j0+0)*nao+(k0+3)];
                val += gout45 * dm[(j0+0)*nao+(k0+4)];
                val += gout55 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+0)*nao+(k0+1)];
                val += gout26 * dm[(j0+0)*nao+(k0+2)];
                val += gout36 * dm[(j0+0)*nao+(k0+3)];
                val += gout46 * dm[(j0+0)*nao+(k0+4)];
                val += gout56 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout17 * dm[(j0+0)*nao+(k0+1)];
                val += gout27 * dm[(j0+0)*nao+(k0+2)];
                val += gout37 * dm[(j0+0)*nao+(k0+3)];
                val += gout47 * dm[(j0+0)*nao+(k0+4)];
                val += gout57 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                val += gout18 * dm[(j0+0)*nao+(k0+1)];
                val += gout28 * dm[(j0+0)*nao+(k0+2)];
                val += gout38 * dm[(j0+0)*nao+(k0+3)];
                val += gout48 * dm[(j0+0)*nao+(k0+4)];
                val += gout58 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout19 * dm[(j0+0)*nao+(k0+1)];
                val += gout29 * dm[(j0+0)*nao+(k0+2)];
                val += gout39 * dm[(j0+0)*nao+(k0+3)];
                val += gout49 * dm[(j0+0)*nao+(k0+4)];
                val += gout59 * dm[(j0+0)*nao+(k0+5)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout10 * dm[(i0+0)*nao+(k0+1)];
                val += gout20 * dm[(i0+0)*nao+(k0+2)];
                val += gout30 * dm[(i0+0)*nao+(k0+3)];
                val += gout40 * dm[(i0+0)*nao+(k0+4)];
                val += gout50 * dm[(i0+0)*nao+(k0+5)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout11 * dm[(i0+1)*nao+(k0+1)];
                val += gout21 * dm[(i0+1)*nao+(k0+2)];
                val += gout31 * dm[(i0+1)*nao+(k0+3)];
                val += gout41 * dm[(i0+1)*nao+(k0+4)];
                val += gout51 * dm[(i0+1)*nao+(k0+5)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout12 * dm[(i0+2)*nao+(k0+1)];
                val += gout22 * dm[(i0+2)*nao+(k0+2)];
                val += gout32 * dm[(i0+2)*nao+(k0+3)];
                val += gout42 * dm[(i0+2)*nao+(k0+4)];
                val += gout52 * dm[(i0+2)*nao+(k0+5)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout13 * dm[(i0+3)*nao+(k0+1)];
                val += gout23 * dm[(i0+3)*nao+(k0+2)];
                val += gout33 * dm[(i0+3)*nao+(k0+3)];
                val += gout43 * dm[(i0+3)*nao+(k0+4)];
                val += gout53 * dm[(i0+3)*nao+(k0+5)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout14 * dm[(i0+4)*nao+(k0+1)];
                val += gout24 * dm[(i0+4)*nao+(k0+2)];
                val += gout34 * dm[(i0+4)*nao+(k0+3)];
                val += gout44 * dm[(i0+4)*nao+(k0+4)];
                val += gout54 * dm[(i0+4)*nao+(k0+5)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout15 * dm[(i0+5)*nao+(k0+1)];
                val += gout25 * dm[(i0+5)*nao+(k0+2)];
                val += gout35 * dm[(i0+5)*nao+(k0+3)];
                val += gout45 * dm[(i0+5)*nao+(k0+4)];
                val += gout55 * dm[(i0+5)*nao+(k0+5)];
                val += gout6 * dm[(i0+6)*nao+(k0+0)];
                val += gout16 * dm[(i0+6)*nao+(k0+1)];
                val += gout26 * dm[(i0+6)*nao+(k0+2)];
                val += gout36 * dm[(i0+6)*nao+(k0+3)];
                val += gout46 * dm[(i0+6)*nao+(k0+4)];
                val += gout56 * dm[(i0+6)*nao+(k0+5)];
                val += gout7 * dm[(i0+7)*nao+(k0+0)];
                val += gout17 * dm[(i0+7)*nao+(k0+1)];
                val += gout27 * dm[(i0+7)*nao+(k0+2)];
                val += gout37 * dm[(i0+7)*nao+(k0+3)];
                val += gout47 * dm[(i0+7)*nao+(k0+4)];
                val += gout57 * dm[(i0+7)*nao+(k0+5)];
                val += gout8 * dm[(i0+8)*nao+(k0+0)];
                val += gout18 * dm[(i0+8)*nao+(k0+1)];
                val += gout28 * dm[(i0+8)*nao+(k0+2)];
                val += gout38 * dm[(i0+8)*nao+(k0+3)];
                val += gout48 * dm[(i0+8)*nao+(k0+4)];
                val += gout58 * dm[(i0+8)*nao+(k0+5)];
                val += gout9 * dm[(i0+9)*nao+(k0+0)];
                val += gout19 * dm[(i0+9)*nao+(k0+1)];
                val += gout29 * dm[(i0+9)*nao+(k0+2)];
                val += gout39 * dm[(i0+9)*nao+(k0+3)];
                val += gout49 * dm[(i0+9)*nao+(k0+4)];
                val += gout59 * dm[(i0+9)*nao+(k0+5)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout30 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+3), val);
                val = 0;
                val += gout40 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+4), val);
                val = 0;
                val += gout50 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+5), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout21 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout31 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+3), val);
                val = 0;
                val += gout41 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+4), val);
                val = 0;
                val += gout51 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+5), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout22 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout32 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+3), val);
                val = 0;
                val += gout42 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+4), val);
                val = 0;
                val += gout52 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+5), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout13 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout23 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout33 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+3), val);
                val = 0;
                val += gout43 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+4), val);
                val = 0;
                val += gout53 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+5), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout14 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout24 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout34 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+3), val);
                val = 0;
                val += gout44 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+4), val);
                val = 0;
                val += gout54 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+5), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout25 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout35 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+3), val);
                val = 0;
                val += gout45 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+4), val);
                val = 0;
                val += gout55 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+5), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+1), val);
                val = 0;
                val += gout26 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+2), val);
                val = 0;
                val += gout36 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+3), val);
                val = 0;
                val += gout46 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+4), val);
                val = 0;
                val += gout56 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+5), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+1), val);
                val = 0;
                val += gout27 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+2), val);
                val = 0;
                val += gout37 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+3), val);
                val = 0;
                val += gout47 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+4), val);
                val = 0;
                val += gout57 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+5), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout18 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+1), val);
                val = 0;
                val += gout28 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+2), val);
                val = 0;
                val += gout38 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+3), val);
                val = 0;
                val += gout48 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+4), val);
                val = 0;
                val += gout58 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+5), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                val = 0;
                val += gout19 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+1), val);
                val = 0;
                val += gout29 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+2), val);
                val = 0;
                val += gout39 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+3), val);
                val = 0;
                val += gout49 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+4), val);
                val = 0;
                val += gout59 * dm[(j0+0)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+5), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                val += gout6 * dm[(i0+6)*nao+(l0+0)];
                val += gout7 * dm[(i0+7)*nao+(l0+0)];
                val += gout8 * dm[(i0+8)*nao+(l0+0)];
                val += gout9 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+0)];
                val += gout11 * dm[(i0+1)*nao+(l0+0)];
                val += gout12 * dm[(i0+2)*nao+(l0+0)];
                val += gout13 * dm[(i0+3)*nao+(l0+0)];
                val += gout14 * dm[(i0+4)*nao+(l0+0)];
                val += gout15 * dm[(i0+5)*nao+(l0+0)];
                val += gout16 * dm[(i0+6)*nao+(l0+0)];
                val += gout17 * dm[(i0+7)*nao+(l0+0)];
                val += gout18 * dm[(i0+8)*nao+(l0+0)];
                val += gout19 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(i0+0)*nao+(l0+0)];
                val += gout21 * dm[(i0+1)*nao+(l0+0)];
                val += gout22 * dm[(i0+2)*nao+(l0+0)];
                val += gout23 * dm[(i0+3)*nao+(l0+0)];
                val += gout24 * dm[(i0+4)*nao+(l0+0)];
                val += gout25 * dm[(i0+5)*nao+(l0+0)];
                val += gout26 * dm[(i0+6)*nao+(l0+0)];
                val += gout27 * dm[(i0+7)*nao+(l0+0)];
                val += gout28 * dm[(i0+8)*nao+(l0+0)];
                val += gout29 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout30 * dm[(i0+0)*nao+(l0+0)];
                val += gout31 * dm[(i0+1)*nao+(l0+0)];
                val += gout32 * dm[(i0+2)*nao+(l0+0)];
                val += gout33 * dm[(i0+3)*nao+(l0+0)];
                val += gout34 * dm[(i0+4)*nao+(l0+0)];
                val += gout35 * dm[(i0+5)*nao+(l0+0)];
                val += gout36 * dm[(i0+6)*nao+(l0+0)];
                val += gout37 * dm[(i0+7)*nao+(l0+0)];
                val += gout38 * dm[(i0+8)*nao+(l0+0)];
                val += gout39 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+3), val);
                val = 0;
                val += gout40 * dm[(i0+0)*nao+(l0+0)];
                val += gout41 * dm[(i0+1)*nao+(l0+0)];
                val += gout42 * dm[(i0+2)*nao+(l0+0)];
                val += gout43 * dm[(i0+3)*nao+(l0+0)];
                val += gout44 * dm[(i0+4)*nao+(l0+0)];
                val += gout45 * dm[(i0+5)*nao+(l0+0)];
                val += gout46 * dm[(i0+6)*nao+(l0+0)];
                val += gout47 * dm[(i0+7)*nao+(l0+0)];
                val += gout48 * dm[(i0+8)*nao+(l0+0)];
                val += gout49 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+4), val);
                val = 0;
                val += gout50 * dm[(i0+0)*nao+(l0+0)];
                val += gout51 * dm[(i0+1)*nao+(l0+0)];
                val += gout52 * dm[(i0+2)*nao+(l0+0)];
                val += gout53 * dm[(i0+3)*nao+(l0+0)];
                val += gout54 * dm[(i0+4)*nao+(l0+0)];
                val += gout55 * dm[(i0+5)*nao+(l0+0)];
                val += gout56 * dm[(i0+6)*nao+(l0+0)];
                val += gout57 * dm[(i0+7)*nao+(l0+0)];
                val += gout58 * dm[(i0+8)*nao+(l0+0)];
                val += gout59 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+5), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_3100(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
                    double trr_40x = c0x * trr_30x + 3*b10 * trr_20x;
                    double hrr_3100x = trr_40x - xjxi * trr_30x;
                    gout0 += hrr_3100x * fac * wt;
                    double hrr_2100x = trr_30x - xjxi * trr_20x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_2100x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_2100x * fac * trr_10z;
                    double hrr_1100x = trr_20x - xjxi * trr_10x;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += hrr_1100x * trr_20y * wt;
                    gout4 += hrr_1100x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += hrr_1100x * fac * trr_20z;
                    double hrr_0100x = trr_10x - xjxi * 1;
                    double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
                    gout6 += hrr_0100x * trr_30y * wt;
                    gout7 += hrr_0100x * trr_20y * trr_10z;
                    gout8 += hrr_0100x * trr_10y * trr_20z;
                    double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
                    gout9 += hrr_0100x * fac * trr_30z;
                    double hrr_0100y = trr_10y - yjyi * fac;
                    gout10 += trr_30x * hrr_0100y * wt;
                    double hrr_1100y = trr_20y - yjyi * trr_10y;
                    gout11 += trr_20x * hrr_1100y * wt;
                    gout12 += trr_20x * hrr_0100y * trr_10z;
                    double hrr_2100y = trr_30y - yjyi * trr_20y;
                    gout13 += trr_10x * hrr_2100y * wt;
                    gout14 += trr_10x * hrr_1100y * trr_10z;
                    gout15 += trr_10x * hrr_0100y * trr_20z;
                    double trr_40y = c0y * trr_30y + 3*b10 * trr_20y;
                    double hrr_3100y = trr_40y - yjyi * trr_30y;
                    gout16 += 1 * hrr_3100y * wt;
                    gout17 += 1 * hrr_2100y * trr_10z;
                    gout18 += 1 * hrr_1100y * trr_20z;
                    gout19 += 1 * hrr_0100y * trr_30z;
                    double hrr_0100z = trr_10z - zjzi * wt;
                    gout20 += trr_30x * fac * hrr_0100z;
                    gout21 += trr_20x * trr_10y * hrr_0100z;
                    double hrr_1100z = trr_20z - zjzi * trr_10z;
                    gout22 += trr_20x * fac * hrr_1100z;
                    gout23 += trr_10x * trr_20y * hrr_0100z;
                    gout24 += trr_10x * trr_10y * hrr_1100z;
                    double hrr_2100z = trr_30z - zjzi * trr_20z;
                    gout25 += trr_10x * fac * hrr_2100z;
                    gout26 += 1 * trr_30y * hrr_0100z;
                    gout27 += 1 * trr_20y * hrr_1100z;
                    gout28 += 1 * trr_10y * hrr_2100z;
                    double trr_40z = c0z * trr_30z + 3*b10 * trr_20z;
                    double hrr_3100z = trr_40z - zjzi * trr_30z;
                    gout29 += 1 * fac * hrr_3100z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+0)];
                val += gout20 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+0)];
                val += gout21 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+1)*nao+(k0+0)];
                val += gout22 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+1)*nao+(k0+0)];
                val += gout23 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+1)*nao+(k0+0)];
                val += gout24 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout15 * dm[(j0+1)*nao+(k0+0)];
                val += gout25 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+1)*nao+(k0+0)];
                val += gout26 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout17 * dm[(j0+1)*nao+(k0+0)];
                val += gout27 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                val += gout18 * dm[(j0+1)*nao+(k0+0)];
                val += gout28 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout19 * dm[(j0+1)*nao+(k0+0)];
                val += gout29 * dm[(j0+2)*nao+(k0+0)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout6 * dm[(i0+6)*nao+(k0+0)];
                val += gout7 * dm[(i0+7)*nao+(k0+0)];
                val += gout8 * dm[(i0+8)*nao+(k0+0)];
                val += gout9 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(k0+0)];
                val += gout11 * dm[(i0+1)*nao+(k0+0)];
                val += gout12 * dm[(i0+2)*nao+(k0+0)];
                val += gout13 * dm[(i0+3)*nao+(k0+0)];
                val += gout14 * dm[(i0+4)*nao+(k0+0)];
                val += gout15 * dm[(i0+5)*nao+(k0+0)];
                val += gout16 * dm[(i0+6)*nao+(k0+0)];
                val += gout17 * dm[(i0+7)*nao+(k0+0)];
                val += gout18 * dm[(i0+8)*nao+(k0+0)];
                val += gout19 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout20 * dm[(i0+0)*nao+(k0+0)];
                val += gout21 * dm[(i0+1)*nao+(k0+0)];
                val += gout22 * dm[(i0+2)*nao+(k0+0)];
                val += gout23 * dm[(i0+3)*nao+(k0+0)];
                val += gout24 * dm[(i0+4)*nao+(k0+0)];
                val += gout25 * dm[(i0+5)*nao+(k0+0)];
                val += gout26 * dm[(i0+6)*nao+(k0+0)];
                val += gout27 * dm[(i0+7)*nao+(k0+0)];
                val += gout28 * dm[(i0+8)*nao+(k0+0)];
                val += gout29 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                val += gout20 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+1)*nao+(l0+0)];
                val += gout22 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+1)*nao+(l0+0)];
                val += gout23 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+1)*nao+(l0+0)];
                val += gout24 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+1)*nao+(l0+0)];
                val += gout25 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+1)*nao+(l0+0)];
                val += gout26 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+1)*nao+(l0+0)];
                val += gout27 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout18 * dm[(j0+1)*nao+(l0+0)];
                val += gout28 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                val += gout29 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                val += gout6 * dm[(i0+6)*nao+(l0+0)];
                val += gout7 * dm[(i0+7)*nao+(l0+0)];
                val += gout8 * dm[(i0+8)*nao+(l0+0)];
                val += gout9 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+0)];
                val += gout11 * dm[(i0+1)*nao+(l0+0)];
                val += gout12 * dm[(i0+2)*nao+(l0+0)];
                val += gout13 * dm[(i0+3)*nao+(l0+0)];
                val += gout14 * dm[(i0+4)*nao+(l0+0)];
                val += gout15 * dm[(i0+5)*nao+(l0+0)];
                val += gout16 * dm[(i0+6)*nao+(l0+0)];
                val += gout17 * dm[(i0+7)*nao+(l0+0)];
                val += gout18 * dm[(i0+8)*nao+(l0+0)];
                val += gout19 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout20 * dm[(i0+0)*nao+(l0+0)];
                val += gout21 * dm[(i0+1)*nao+(l0+0)];
                val += gout22 * dm[(i0+2)*nao+(l0+0)];
                val += gout23 * dm[(i0+3)*nao+(l0+0)];
                val += gout24 * dm[(i0+4)*nao+(l0+0)];
                val += gout25 * dm[(i0+5)*nao+(l0+0)];
                val += gout26 * dm[(i0+6)*nao+(l0+0)];
                val += gout27 * dm[(i0+7)*nao+(l0+0)];
                val += gout28 * dm[(i0+8)*nao+(l0+0)];
                val += gout29 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_3110(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int gout_id = threadIdx.y;
    int thread_id = 64 * gout_id + sq_id;
    int threads = 256;
    constexpr int nsq_per_block = 64;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (thread_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (thread_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    constexpr int g_size = 16;

    extern __shared__ double shared_memory[];
    double *rlrk = shared_memory + sq_id;
    double *Rpq = shared_memory + nsq_per_block * 3 + sq_id;
    double *akl_cache = shared_memory + nsq_per_block * 6 + sq_id;
    double *fac_ijkl = shared_memory + nsq_per_block * 8 + sq_id;
    double *gx = shared_memory + nsq_per_block * 9 + sq_id;
    double *rw = shared_memory + nsq_per_block * (g_size*3+9) + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * (g_size*3+bounds.nroots*2+9);

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double aij_cache[2];
    __shared__ double *expi;
    __shared__ double *expj;
    if (thread_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (thread_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[thread_id] = env[ri_ptr+thread_id];
        rjri[thread_id] = env[rj_ptr+thread_id] - ri[thread_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = thread_id; ij < iprim*jprim; ij += threads) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }

    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        int iprim = bounds.iprim;
        int jprim = bounds.jprim;
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        if (gout_id == 0) {
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            rlrk[0] = xlxk;
            rlrk[64] = ylyk;
            rlrk[128] = zlzk;
            fac_ijkl[0] = fac_sym;
        }
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            __syncthreads();
            if (gout_id == 0) {
                double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
                double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
                double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
                double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
                int kp = klp / lprim;
                int lp = klp % lprim;
                double ak = expk[kp];
                double al = expl[lp];
                double akl = ak + al;
                double al_akl = al / akl;
                double xlxk = rlrk[0];
                double ylyk = rlrk[64];
                double zlzk = rlrk[128];
                double theta_kl = ak * al_akl;
                double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
                double ckcl = ck[kp] * cl[lp] * Kcd;
                double fac_sym = fac_ijkl[0];
                gx[0] = fac_sym * ckcl;
                akl_cache[0] = akl;
                akl_cache[nsq_per_block] = al_akl;
            }
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double akl = akl_cache[0];
                double al_akl = akl_cache[nsq_per_block];
                double xij = ri[0] + (rjri[0]) * aj_aij;
                double yij = ri[1] + (rjri[1]) * aj_aij;
                double zij = ri[2] + (rjri[2]) * aj_aij;
                double xkl = rk[0] + rlrk[0*nsq_per_block] * al_akl;
                double ykl = rk[1] + rlrk[1*nsq_per_block] * al_akl;
                double zkl = rk[2] + rlrk[2*nsq_per_block] * al_akl;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                if (gout_id == 0) {
                    Rpq[0*nsq_per_block] = xpq;
                    Rpq[1*nsq_per_block] = ypq;
                    Rpq[2*nsq_per_block] = zpq;
                    double cicj = cicj_cache[ijp];
                    gx[nsq_per_block*g_size] = cicj / (aij*akl*sqrt(aij+akl));
                    if (sq_id == 0) {
                        aij_cache[0] = aij;
                        aij_cache[1] = aj_aij;
                    }
                }
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                for (int irys = 0; irys < nroots; ++irys) {
                    __syncthreads();
                    double s0, s1, s2;
                    double rt = rw[irys*128];
                    double aij = aij_cache[0];
                    double rt_aa = rt / (aij + akl);
                    double akl = akl_cache[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double rt_akl = rt_aa * aij;
                    double b00 = .5 * rt_aa;
                    for (int n = gout_id; n < 3; n += 4) {
                        if (n == 2) {
                            gx[2048] = rw[irys*128+64];
                        }
                        double *_gx = gx + n * 1024;
                        double xjxi = rjri[n];
                        double Rpa = xjxi * aij_cache[1];
                        double c0x = Rpa - rt_aij * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = c0x * s0;
                        _gx[64] = s1;
                        s2 = c0x * s1 + 1 * b10 * s0;
                        _gx[128] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = c0x * s1 + 2 * b10 * s0;
                        _gx[192] = s2;
                        s0 = s1;
                        s1 = s2;
                        s2 = c0x * s1 + 3 * b10 * s0;
                        _gx[256] = s2;
                        double xlxk = rlrk[n*64];
                        double Rqc = xlxk * akl_cache[64];
                        double cpx = Rqc + rt_akl * Rpq[n*64];
                        s0 = _gx[0];
                        s1 = cpx * s0;
                        _gx[512] = s1;
                        s0 = _gx[64];
                        s1 = cpx * s0;
                        s1 += 1 * b00 * _gx[0];
                        _gx[576] = s1;
                        s0 = _gx[128];
                        s1 = cpx * s0;
                        s1 += 2 * b00 * _gx[64];
                        _gx[640] = s1;
                        s0 = _gx[192];
                        s1 = cpx * s0;
                        s1 += 3 * b00 * _gx[128];
                        _gx[704] = s1;
                        s0 = _gx[256];
                        s1 = cpx * s0;
                        s1 += 4 * b00 * _gx[192];
                        _gx[768] = s1;
                        s1 = _gx[256];
                        s0 = _gx[192];
                        _gx[448] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[128];
                        _gx[384] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[64];
                        _gx[320] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[0];
                        _gx[256] = s1 - xjxi * s0;
                        s1 = _gx[768];
                        s0 = _gx[704];
                        _gx[960] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[640];
                        _gx[896] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[576];
                        _gx[832] = s1 - xjxi * s0;
                        s1 = s0;
                        s0 = _gx[512];
                        _gx[768] = s1 - xjxi * s0;
                    }
                    __syncthreads();
                    switch (gout_id) {
                    case 0:
                    gout0 += gx[960] * gx[1024] * gx[2048];
                    gout1 += gx[832] * gx[1088] * gx[2112];
                    gout2 += gx[768] * gx[1088] * gx[2176];
                    gout3 += gx[640] * gx[1280] * gx[2112];
                    gout4 += gx[512] * gx[1472] * gx[2048];
                    gout5 += gx[704] * gx[1024] * gx[2304];
                    gout6 += gx[576] * gx[1088] * gx[2368];
                    gout7 += gx[512] * gx[1088] * gx[2432];
                    gout8 += gx[384] * gx[1536] * gx[2112];
                    gout9 += gx[256] * gx[1728] * gx[2048];
                    gout10 += gx[192] * gx[1792] * gx[2048];
                    gout11 += gx[64] * gx[1856] * gx[2112];
                    gout12 += gx[0] * gx[1856] * gx[2176];
                    gout13 += gx[128] * gx[1536] * gx[2368];
                    gout14 += gx[0] * gx[1728] * gx[2304];
                    gout15 += gx[448] * gx[1024] * gx[2560];
                    gout16 += gx[320] * gx[1088] * gx[2624];
                    gout17 += gx[256] * gx[1088] * gx[2688];
                    gout18 += gx[128] * gx[1280] * gx[2624];
                    gout19 += gx[0] * gx[1472] * gx[2560];
                    gout20 += gx[192] * gx[1024] * gx[2816];
                    gout21 += gx[64] * gx[1088] * gx[2880];
                    gout22 += gx[0] * gx[1088] * gx[2944];
                    break;
                    case 1:
                    gout0 += gx[896] * gx[1088] * gx[2048];
                    gout1 += gx[832] * gx[1024] * gx[2176];
                    gout2 += gx[768] * gx[1024] * gx[2240];
                    gout3 += gx[576] * gx[1408] * gx[2048];
                    gout4 += gx[512] * gx[1408] * gx[2112];
                    gout5 += gx[640] * gx[1088] * gx[2304];
                    gout6 += gx[576] * gx[1024] * gx[2432];
                    gout7 += gx[512] * gx[1024] * gx[2496];
                    gout8 += gx[320] * gx[1664] * gx[2048];
                    gout9 += gx[256] * gx[1664] * gx[2112];
                    gout10 += gx[128] * gx[1856] * gx[2048];
                    gout11 += gx[64] * gx[1792] * gx[2176];
                    gout12 += gx[0] * gx[1792] * gx[2240];
                    gout13 += gx[64] * gx[1664] * gx[2304];
                    gout14 += gx[0] * gx[1664] * gx[2368];
                    gout15 += gx[384] * gx[1088] * gx[2560];
                    gout16 += gx[320] * gx[1024] * gx[2688];
                    gout17 += gx[256] * gx[1024] * gx[2752];
                    gout18 += gx[64] * gx[1408] * gx[2560];
                    gout19 += gx[0] * gx[1408] * gx[2624];
                    gout20 += gx[128] * gx[1088] * gx[2816];
                    gout21 += gx[64] * gx[1024] * gx[2944];
                    gout22 += gx[0] * gx[1024] * gx[3008];
                    break;
                    case 2:
                    gout0 += gx[896] * gx[1024] * gx[2112];
                    gout1 += gx[768] * gx[1216] * gx[2048];
                    gout2 += gx[704] * gx[1280] * gx[2048];
                    gout3 += gx[576] * gx[1344] * gx[2112];
                    gout4 += gx[512] * gx[1344] * gx[2176];
                    gout5 += gx[640] * gx[1024] * gx[2368];
                    gout6 += gx[512] * gx[1216] * gx[2304];
                    gout7 += gx[448] * gx[1536] * gx[2048];
                    gout8 += gx[320] * gx[1600] * gx[2112];
                    gout9 += gx[256] * gx[1600] * gx[2176];
                    gout10 += gx[128] * gx[1792] * gx[2112];
                    gout11 += gx[0] * gx[1984] * gx[2048];
                    gout12 += gx[192] * gx[1536] * gx[2304];
                    gout13 += gx[64] * gx[1600] * gx[2368];
                    gout14 += gx[0] * gx[1600] * gx[2432];
                    gout15 += gx[384] * gx[1024] * gx[2624];
                    gout16 += gx[256] * gx[1216] * gx[2560];
                    gout17 += gx[192] * gx[1280] * gx[2560];
                    gout18 += gx[64] * gx[1344] * gx[2624];
                    gout19 += gx[0] * gx[1344] * gx[2688];
                    gout20 += gx[128] * gx[1024] * gx[2880];
                    gout21 += gx[0] * gx[1216] * gx[2816];
                    break;
                    case 3:
                    gout0 += gx[832] * gx[1152] * gx[2048];
                    gout1 += gx[768] * gx[1152] * gx[2112];
                    gout2 += gx[640] * gx[1344] * gx[2048];
                    gout3 += gx[576] * gx[1280] * gx[2176];
                    gout4 += gx[512] * gx[1280] * gx[2240];
                    gout5 += gx[576] * gx[1152] * gx[2304];
                    gout6 += gx[512] * gx[1152] * gx[2368];
                    gout7 += gx[384] * gx[1600] * gx[2048];
                    gout8 += gx[320] * gx[1536] * gx[2176];
                    gout9 += gx[256] * gx[1536] * gx[2240];
                    gout10 += gx[64] * gx[1920] * gx[2048];
                    gout11 += gx[0] * gx[1920] * gx[2112];
                    gout12 += gx[128] * gx[1600] * gx[2304];
                    gout13 += gx[64] * gx[1536] * gx[2432];
                    gout14 += gx[0] * gx[1536] * gx[2496];
                    gout15 += gx[320] * gx[1152] * gx[2560];
                    gout16 += gx[256] * gx[1152] * gx[2624];
                    gout17 += gx[128] * gx[1344] * gx[2560];
                    gout18 += gx[64] * gx[1280] * gx[2688];
                    gout19 += gx[0] * gx[1280] * gx[2752];
                    gout20 += gx[64] * gx[1152] * gx[2816];
                    gout21 += gx[0] * gx[1152] * gx[2880];
                    break;
                    }
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                switch (gout_id) {
                case 0:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout15 * dm[(j0+0)*nao+(k0+2)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout5 * dm[(j0+2)*nao+(k0+0)];
                val += gout20 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+1)];
                val += gout3 * dm[(j0+1)*nao+(k0+0)];
                val += gout18 * dm[(j0+1)*nao+(k0+2)];
                val += gout13 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+0)*nao+(k0+2)];
                val += gout11 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+0)];
                val += gout21 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout4 * dm[(j0+1)*nao+(k0+0)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                val += gout14 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout17 * dm[(j0+0)*nao+(k0+2)];
                val += gout12 * dm[(j0+1)*nao+(k0+1)];
                val += gout7 * dm[(j0+2)*nao+(k0+0)];
                val += gout22 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout15 * dm[(i0+0)*nao+(k0+2)];
                val += gout8 * dm[(i0+2)*nao+(k0+1)];
                val += gout1 * dm[(i0+4)*nao+(k0+0)];
                val += gout16 * dm[(i0+4)*nao+(k0+2)];
                val += gout9 * dm[(i0+6)*nao+(k0+1)];
                val += gout2 * dm[(i0+8)*nao+(k0+0)];
                val += gout17 * dm[(i0+8)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(k0+1)];
                val += gout3 * dm[(i0+2)*nao+(k0+0)];
                val += gout18 * dm[(i0+2)*nao+(k0+2)];
                val += gout11 * dm[(i0+4)*nao+(k0+1)];
                val += gout4 * dm[(i0+6)*nao+(k0+0)];
                val += gout19 * dm[(i0+6)*nao+(k0+2)];
                val += gout12 * dm[(i0+8)*nao+(k0+1)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(i0+0)*nao+(k0+0)];
                val += gout20 * dm[(i0+0)*nao+(k0+2)];
                val += gout13 * dm[(i0+2)*nao+(k0+1)];
                val += gout6 * dm[(i0+4)*nao+(k0+0)];
                val += gout21 * dm[(i0+4)*nao+(k0+2)];
                val += gout14 * dm[(i0+6)*nao+(k0+1)];
                val += gout7 * dm[(i0+8)*nao+(k0+0)];
                val += gout22 * dm[(i0+8)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout5 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout7 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+4)*nao+(l0+0)];
                val += gout2 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(i0+2)*nao+(l0+0)];
                val += gout9 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(i0+0)*nao+(l0+0)];
                val += gout16 * dm[(i0+4)*nao+(l0+0)];
                val += gout17 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(i0+2)*nao+(l0+0)];
                val += gout4 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+0)];
                val += gout11 * dm[(i0+4)*nao+(l0+0)];
                val += gout12 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+2)*nao+(l0+0)];
                val += gout19 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(i0+0)*nao+(l0+0)];
                val += gout6 * dm[(i0+4)*nao+(l0+0)];
                val += gout7 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout13 * dm[(i0+2)*nao+(l0+0)];
                val += gout14 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(i0+0)*nao+(l0+0)];
                val += gout21 * dm[(i0+4)*nao+(l0+0)];
                val += gout22 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 1:
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout15 * dm[(j0+0)*nao+(k0+2)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout5 * dm[(j0+2)*nao+(k0+0)];
                val += gout20 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+1)];
                val += gout3 * dm[(j0+1)*nao+(k0+0)];
                val += gout18 * dm[(j0+1)*nao+(k0+2)];
                val += gout13 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+0)*nao+(k0+2)];
                val += gout11 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+0)];
                val += gout21 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout4 * dm[(j0+1)*nao+(k0+0)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                val += gout14 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout17 * dm[(j0+0)*nao+(k0+2)];
                val += gout12 * dm[(j0+1)*nao+(k0+1)];
                val += gout7 * dm[(j0+2)*nao+(k0+0)];
                val += gout22 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(k0+0)];
                val += gout15 * dm[(i0+1)*nao+(k0+2)];
                val += gout8 * dm[(i0+3)*nao+(k0+1)];
                val += gout1 * dm[(i0+5)*nao+(k0+0)];
                val += gout16 * dm[(i0+5)*nao+(k0+2)];
                val += gout9 * dm[(i0+7)*nao+(k0+1)];
                val += gout2 * dm[(i0+9)*nao+(k0+0)];
                val += gout17 * dm[(i0+9)*nao+(k0+2)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(k0+1)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout18 * dm[(i0+3)*nao+(k0+2)];
                val += gout11 * dm[(i0+5)*nao+(k0+1)];
                val += gout4 * dm[(i0+7)*nao+(k0+0)];
                val += gout19 * dm[(i0+7)*nao+(k0+2)];
                val += gout12 * dm[(i0+9)*nao+(k0+1)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(i0+1)*nao+(k0+0)];
                val += gout20 * dm[(i0+1)*nao+(k0+2)];
                val += gout13 * dm[(i0+3)*nao+(k0+1)];
                val += gout6 * dm[(i0+5)*nao+(k0+0)];
                val += gout21 * dm[(i0+5)*nao+(k0+2)];
                val += gout14 * dm[(i0+7)*nao+(k0+1)];
                val += gout7 * dm[(i0+9)*nao+(k0+0)];
                val += gout22 * dm[(i0+9)*nao+(k0+2)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout5 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout7 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(j0+0)*nao+(l0+0)];
                val += gout22 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+1)*nao+(l0+0)];
                val += gout1 * dm[(i0+5)*nao+(l0+0)];
                val += gout2 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(i0+3)*nao+(l0+0)];
                val += gout9 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(i0+1)*nao+(l0+0)];
                val += gout16 * dm[(i0+5)*nao+(l0+0)];
                val += gout17 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+1)*nao+(l0+0)];
                val += gout11 * dm[(i0+5)*nao+(l0+0)];
                val += gout12 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(i0+3)*nao+(l0+0)];
                val += gout19 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(i0+1)*nao+(l0+0)];
                val += gout6 * dm[(i0+5)*nao+(l0+0)];
                val += gout7 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout13 * dm[(i0+3)*nao+(l0+0)];
                val += gout14 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(i0+1)*nao+(l0+0)];
                val += gout21 * dm[(i0+5)*nao+(l0+0)];
                val += gout22 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 2:
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+1)];
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+1)*nao+(k0+2)];
                val += gout12 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout15 * dm[(j0+0)*nao+(k0+2)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout5 * dm[(j0+2)*nao+(k0+0)];
                val += gout20 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+1)];
                val += gout3 * dm[(j0+1)*nao+(k0+0)];
                val += gout18 * dm[(j0+1)*nao+(k0+2)];
                val += gout13 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+0)*nao+(k0+2)];
                val += gout11 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+0)];
                val += gout21 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout4 * dm[(j0+1)*nao+(k0+0)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                val += gout14 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(k0+1)];
                val += gout0 * dm[(i0+2)*nao+(k0+0)];
                val += gout15 * dm[(i0+2)*nao+(k0+2)];
                val += gout8 * dm[(i0+4)*nao+(k0+1)];
                val += gout1 * dm[(i0+6)*nao+(k0+0)];
                val += gout16 * dm[(i0+6)*nao+(k0+2)];
                val += gout9 * dm[(i0+8)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(i0+0)*nao+(k0+0)];
                val += gout17 * dm[(i0+0)*nao+(k0+2)];
                val += gout10 * dm[(i0+2)*nao+(k0+1)];
                val += gout3 * dm[(i0+4)*nao+(k0+0)];
                val += gout18 * dm[(i0+4)*nao+(k0+2)];
                val += gout11 * dm[(i0+6)*nao+(k0+1)];
                val += gout4 * dm[(i0+8)*nao+(k0+0)];
                val += gout19 * dm[(i0+8)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(k0+1)];
                val += gout5 * dm[(i0+2)*nao+(k0+0)];
                val += gout20 * dm[(i0+2)*nao+(k0+2)];
                val += gout13 * dm[(i0+4)*nao+(k0+1)];
                val += gout6 * dm[(i0+6)*nao+(k0+0)];
                val += gout21 * dm[(i0+6)*nao+(k0+2)];
                val += gout14 * dm[(i0+8)*nao+(k0+1)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout5 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+2)*nao+(l0+0)];
                val += gout1 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(i0+0)*nao+(l0+0)];
                val += gout8 * dm[(i0+4)*nao+(l0+0)];
                val += gout9 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(i0+2)*nao+(l0+0)];
                val += gout16 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(i0+0)*nao+(l0+0)];
                val += gout3 * dm[(i0+4)*nao+(l0+0)];
                val += gout4 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+2)*nao+(l0+0)];
                val += gout11 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(i0+0)*nao+(l0+0)];
                val += gout18 * dm[(i0+4)*nao+(l0+0)];
                val += gout19 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(i0+2)*nao+(l0+0)];
                val += gout6 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+0)*nao+(l0+0)];
                val += gout13 * dm[(i0+4)*nao+(l0+0)];
                val += gout14 * dm[(i0+8)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(i0+2)*nao+(l0+0)];
                val += gout21 * dm[(i0+6)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                case 3:
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+1)];
                val += gout2 * dm[(j0+1)*nao+(k0+0)];
                val += gout17 * dm[(j0+1)*nao+(k0+2)];
                val += gout12 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout15 * dm[(j0+0)*nao+(k0+2)];
                val += gout10 * dm[(j0+1)*nao+(k0+1)];
                val += gout5 * dm[(j0+2)*nao+(k0+0)];
                val += gout20 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+1)];
                val += gout3 * dm[(j0+1)*nao+(k0+0)];
                val += gout18 * dm[(j0+1)*nao+(k0+2)];
                val += gout13 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+0)*nao+(k0+2)];
                val += gout11 * dm[(j0+1)*nao+(k0+1)];
                val += gout6 * dm[(j0+2)*nao+(k0+0)];
                val += gout21 * dm[(j0+2)*nao+(k0+2)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+1)];
                val += gout4 * dm[(j0+1)*nao+(k0+0)];
                val += gout19 * dm[(j0+1)*nao+(k0+2)];
                val += gout14 * dm[(j0+2)*nao+(k0+1)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(k0+1)];
                val += gout0 * dm[(i0+3)*nao+(k0+0)];
                val += gout15 * dm[(i0+3)*nao+(k0+2)];
                val += gout8 * dm[(i0+5)*nao+(k0+1)];
                val += gout1 * dm[(i0+7)*nao+(k0+0)];
                val += gout16 * dm[(i0+7)*nao+(k0+2)];
                val += gout9 * dm[(i0+9)*nao+(k0+1)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(i0+1)*nao+(k0+0)];
                val += gout17 * dm[(i0+1)*nao+(k0+2)];
                val += gout10 * dm[(i0+3)*nao+(k0+1)];
                val += gout3 * dm[(i0+5)*nao+(k0+0)];
                val += gout18 * dm[(i0+5)*nao+(k0+2)];
                val += gout11 * dm[(i0+7)*nao+(k0+1)];
                val += gout4 * dm[(i0+9)*nao+(k0+0)];
                val += gout19 * dm[(i0+9)*nao+(k0+2)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(k0+1)];
                val += gout5 * dm[(i0+3)*nao+(k0+0)];
                val += gout20 * dm[(i0+3)*nao+(k0+2)];
                val += gout13 * dm[(i0+5)*nao+(k0+1)];
                val += gout6 * dm[(i0+7)*nao+(k0+0)];
                val += gout21 * dm[(i0+7)*nao+(k0+2)];
                val += gout14 * dm[(i0+9)*nao+(k0+1)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout5 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(j0+0)*nao+(l0+0)];
                val += gout20 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+2), val);
                val = 0;
                val += gout3 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+1), val);
                val = 0;
                val += gout18 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+2), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout6 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+1), val);
                val = 0;
                val += gout16 * dm[(j0+0)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+2), val);
                val = 0;
                val += gout4 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+2)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+1), val);
                val = 0;
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+2), val);
                val = 0;
                val += gout0 * dm[(i0+3)*nao+(l0+0)];
                val += gout1 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(i0+1)*nao+(l0+0)];
                val += gout8 * dm[(i0+5)*nao+(l0+0)];
                val += gout9 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+1), val);
                val = 0;
                val += gout15 * dm[(i0+3)*nao+(l0+0)];
                val += gout16 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+2), val);
                val = 0;
                val += gout2 * dm[(i0+1)*nao+(l0+0)];
                val += gout3 * dm[(i0+5)*nao+(l0+0)];
                val += gout4 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+3)*nao+(l0+0)];
                val += gout11 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+1), val);
                val = 0;
                val += gout17 * dm[(i0+1)*nao+(l0+0)];
                val += gout18 * dm[(i0+5)*nao+(l0+0)];
                val += gout19 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+2), val);
                val = 0;
                val += gout5 * dm[(i0+3)*nao+(l0+0)];
                val += gout6 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout12 * dm[(i0+1)*nao+(l0+0)];
                val += gout13 * dm[(i0+5)*nao+(l0+0)];
                val += gout14 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+1), val);
                val = 0;
                val += gout20 * dm[(i0+3)*nao+(l0+0)];
                val += gout21 * dm[(i0+7)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+2), val);
                break;
                }
            }
        }
    }
    if (thread_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

__global__ static
void rys_k_3200(RysIntEnvVars envs, JKMatrix kmat, BoundsInfo bounds, int *pool, int *head)
{
    int sq_id = threadIdx.x;
    int nsq_per_block = blockDim.x;
    int *bas_kl_idx = pool + blockIdx.x * QUEUE_DEPTH;
    __shared__ int ntasks, pair_ij;
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
while (pair_ij < bounds.npairs_ij) {
    int bas_ij = bounds.pair_ij_mapping[pair_ij];
    if (sq_id == 0) {
        ntasks = 0;
    }
    __syncthreads();
    if (kmat.lr_factor != 0) {
        _fill_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    } else {
        _fill_sr_vk_tasks(&ntasks, bas_kl_idx, bas_ij, envs, bounds);
    }
    if (ntasks == 0) {
        if (sq_id == 0) {
            pair_ij = atomicAdd(head, 1);
        }
        __syncthreads();
        continue;
    }

    extern __shared__ double shared_memory[];
    double *rw = shared_memory + sq_id;
    double *cicj_cache = shared_memory + nsq_per_block * bounds.nroots*2;

    int nbas = envs.nbas;
    int *bas = envs.bas;
    double *env = envs.env;
    __shared__ int ish;
    __shared__ int jsh;
    __shared__ double ri[3];
    __shared__ double rjri[3];
    __shared__ double *expi;
    __shared__ double *expj;
    if (sq_id == 0) {
        ish = bas_ij / nbas;
        jsh = bas_ij % nbas;
        expi = env + bas[ish*BAS_SLOTS+PTR_EXP];
        expj = env + bas[jsh*BAS_SLOTS+PTR_EXP];
    }
    if (sq_id < 3) {
        int ri_ptr = bas[ish*BAS_SLOTS+PTR_BAS_COORD];
        int rj_ptr = bas[jsh*BAS_SLOTS+PTR_BAS_COORD];
        ri[sq_id] = env[ri_ptr+sq_id];
        rjri[sq_id] = env[rj_ptr+sq_id] - ri[sq_id];
    }
    __syncthreads();
    int iprim = bounds.iprim;
    int jprim = bounds.jprim;
    double *ci = env + bas[ish*BAS_SLOTS+PTR_COEFF];
    double *cj = env + bas[jsh*BAS_SLOTS+PTR_COEFF];
    double xjxi = rjri[0];
    double yjyi = rjri[1];
    double zjzi = rjri[2];
    for (int ij = sq_id; ij < iprim*jprim; ij += nsq_per_block) {
        int ip = ij / jprim;
        int jp = ij % jprim;
        double ai = expi[ip];
        double aj = expj[jp];
        double aij = ai + aj;
        double theta_ij = ai * aj / aij;
        double rr_ij = xjxi*xjxi + yjyi*yjyi + zjzi*zjzi;
        double Kab = exp(-theta_ij * rr_ij);
        cicj_cache[ij] = ci[ip] * cj[jp] * Kab;
    }
    for (int task_id = sq_id; task_id < ntasks+sq_id; task_id += nsq_per_block) {
        __syncthreads();
        int kprim = bounds.kprim;
        int lprim = bounds.lprim;
        int bas_kl = bas_kl_idx[task_id];
        int ksh = bas_kl / nbas;
        int lsh = bas_kl % nbas;
        double fac_sym = PI_FAC;
        if (task_id < ntasks) {
            if (ish == jsh) fac_sym *= .5;
            if (ksh == lsh) fac_sym *= .5;
            if (ish*nbas+jsh == bas_kl) fac_sym *= .5;
        } else {
            fac_sym = 0;
        }
        double *expk = env + bas[ksh*BAS_SLOTS+PTR_EXP];
        double *expl = env + bas[lsh*BAS_SLOTS+PTR_EXP];
        double *ck = env + bas[ksh*BAS_SLOTS+PTR_COEFF];
        double *cl = env + bas[lsh*BAS_SLOTS+PTR_COEFF];
        double *rk = env + bas[ksh*BAS_SLOTS+PTR_BAS_COORD];
        double *rl = env + bas[lsh*BAS_SLOTS+PTR_BAS_COORD];
        double gout0;
        double gout1;
        double gout2;
        double gout3;
        double gout4;
        double gout5;
        double gout6;
        double gout7;
        double gout8;
        double gout9;
        double gout10;
        double gout11;
        double gout12;
        double gout13;
        double gout14;
        double gout15;
        double gout16;
        double gout17;
        double gout18;
        double gout19;
        double gout20;
        double gout21;
        double gout22;
        double gout23;
        double gout24;
        double gout25;
        double gout26;
        double gout27;
        double gout28;
        double gout29;
        double gout30;
        double gout31;
        double gout32;
        double gout33;
        double gout34;
        double gout35;
        double gout36;
        double gout37;
        double gout38;
        double gout39;
        double gout40;
        double gout41;
        double gout42;
        double gout43;
        double gout44;
        double gout45;
        double gout46;
        double gout47;
        double gout48;
        double gout49;
        double gout50;
        double gout51;
        double gout52;
        double gout53;
        double gout54;
        double gout55;
        double gout56;
        double gout57;
        double gout58;
        double gout59;
        
        gout0 = 0;
        gout1 = 0;
        gout2 = 0;
        gout3 = 0;
        gout4 = 0;
        gout5 = 0;
        gout6 = 0;
        gout7 = 0;
        gout8 = 0;
        gout9 = 0;
        gout10 = 0;
        gout11 = 0;
        gout12 = 0;
        gout13 = 0;
        gout14 = 0;
        gout15 = 0;
        gout16 = 0;
        gout17 = 0;
        gout18 = 0;
        gout19 = 0;
        gout20 = 0;
        gout21 = 0;
        gout22 = 0;
        gout23 = 0;
        gout24 = 0;
        gout25 = 0;
        gout26 = 0;
        gout27 = 0;
        gout28 = 0;
        gout29 = 0;
        gout30 = 0;
        gout31 = 0;
        gout32 = 0;
        gout33 = 0;
        gout34 = 0;
        gout35 = 0;
        gout36 = 0;
        gout37 = 0;
        gout38 = 0;
        gout39 = 0;
        gout40 = 0;
        gout41 = 0;
        gout42 = 0;
        gout43 = 0;
        gout44 = 0;
        gout45 = 0;
        gout46 = 0;
        gout47 = 0;
        gout48 = 0;
        gout49 = 0;
        gout50 = 0;
        gout51 = 0;
        gout52 = 0;
        gout53 = 0;
        gout54 = 0;
        gout55 = 0;
        gout56 = 0;
        gout57 = 0;
        gout58 = 0;
        gout59 = 0;
        for (int klp = 0; klp < kprim*lprim; ++klp) {
            int kp = klp / lprim;
            int lp = klp % lprim;
            double ak = expk[kp];
            double al = expl[lp];
            double akl = ak + al;
            double al_akl = al / akl;
            double xlxk = rl[0] - rk[0];
            double ylyk = rl[1] - rk[1];
            double zlzk = rl[2] - rk[2];
            double theta_kl = ak * al_akl;
            double Kcd = exp(-theta_kl * (xlxk*xlxk+ylyk*ylyk+zlzk*zlzk));
            double ckcl = fac_sym * ck[kp] * cl[lp] * Kcd;
            for (int ijp = 0; ijp < iprim*jprim; ++ijp) {
                __syncthreads();
                int ip = ijp / jprim;
                int jp = ijp % jprim;
                double ai = expi[ip];
                double aj = expj[jp];
                double aij = ai + aj;
                double aj_aij = aj / aij;
                double cicj = cicj_cache[ijp];
                double fac = cicj * ckcl / (aij*akl*sqrt(aij+akl));
                double xpa = (rjri[0]) * aj_aij;
                double ypa = (rjri[1]) * aj_aij;
                double zpa = (rjri[2]) * aj_aij;
                double xij = ri[0] + xpa;
                double yij = ri[1] + ypa;
                double zij = ri[2] + zpa;
                double xqc = xlxk * al_akl; // (ak*xk+al*xl)/akl
                double yqc = ylyk * al_akl;
                double zqc = zlzk * al_akl;
                double xkl = rk[0] + xqc;
                double ykl = rk[1] + yqc;
                double zkl = rk[2] + zqc;
                double xpq = xij - xkl;
                double ypq = yij - ykl;
                double zpq = zij - zkl;
                double theta = aij * akl / (aij + akl);
                double rr = xpq * xpq + ypq * ypq + zpq * zpq;
                int nroots = bounds.nroots;
                rys_roots_for_k(nroots, theta, rr, rw, kmat.omega, kmat.lr_factor, kmat.sr_factor);
                if (task_id >= ntasks) {
                    continue;
                }
                for (int irys = 0; irys < nroots; ++irys) {
                    double wt = rw[(2*irys+1)*nsq_per_block];
                    double rt = rw[ 2*irys   *nsq_per_block];
                    double rt_aa = rt / (aij + akl);
                    double xjxi = rjri[0];
                    double rt_aij = rt_aa * akl;
                    double b10 = .5/aij * (1 - rt_aij);
                    double c0x = xjxi*aj_aij - xpq*rt_aij;
                    double trr_10x = c0x * 1;
                    double trr_20x = c0x * trr_10x + 1*b10 * 1;
                    double trr_30x = c0x * trr_20x + 2*b10 * trr_10x;
                    double trr_40x = c0x * trr_30x + 3*b10 * trr_20x;
                    double trr_50x = c0x * trr_40x + 4*b10 * trr_30x;
                    double hrr_4100x = trr_50x - xjxi * trr_40x;
                    double hrr_3100x = trr_40x - xjxi * trr_30x;
                    double hrr_3200x = hrr_4100x - xjxi * hrr_3100x;
                    gout0 += hrr_3200x * fac * wt;
                    double hrr_2100x = trr_30x - xjxi * trr_20x;
                    double hrr_2200x = hrr_3100x - xjxi * hrr_2100x;
                    double yjyi = rjri[1];
                    double c0y = yjyi*aj_aij - ypq*rt_aij;
                    double trr_10y = c0y * fac;
                    gout1 += hrr_2200x * trr_10y * wt;
                    double zjzi = rjri[2];
                    double c0z = zjzi*aj_aij - zpq*rt_aij;
                    double trr_10z = c0z * wt;
                    gout2 += hrr_2200x * fac * trr_10z;
                    double hrr_1100x = trr_20x - xjxi * trr_10x;
                    double hrr_1200x = hrr_2100x - xjxi * hrr_1100x;
                    double trr_20y = c0y * trr_10y + 1*b10 * fac;
                    gout3 += hrr_1200x * trr_20y * wt;
                    gout4 += hrr_1200x * trr_10y * trr_10z;
                    double trr_20z = c0z * trr_10z + 1*b10 * wt;
                    gout5 += hrr_1200x * fac * trr_20z;
                    double hrr_0100x = trr_10x - xjxi * 1;
                    double hrr_0200x = hrr_1100x - xjxi * hrr_0100x;
                    double trr_30y = c0y * trr_20y + 2*b10 * trr_10y;
                    gout6 += hrr_0200x * trr_30y * wt;
                    gout7 += hrr_0200x * trr_20y * trr_10z;
                    gout8 += hrr_0200x * trr_10y * trr_20z;
                    double trr_30z = c0z * trr_20z + 2*b10 * trr_10z;
                    gout9 += hrr_0200x * fac * trr_30z;
                    double hrr_0100y = trr_10y - yjyi * fac;
                    gout10 += hrr_3100x * hrr_0100y * wt;
                    double hrr_1100y = trr_20y - yjyi * trr_10y;
                    gout11 += hrr_2100x * hrr_1100y * wt;
                    gout12 += hrr_2100x * hrr_0100y * trr_10z;
                    double hrr_2100y = trr_30y - yjyi * trr_20y;
                    gout13 += hrr_1100x * hrr_2100y * wt;
                    gout14 += hrr_1100x * hrr_1100y * trr_10z;
                    gout15 += hrr_1100x * hrr_0100y * trr_20z;
                    double trr_40y = c0y * trr_30y + 3*b10 * trr_20y;
                    double hrr_3100y = trr_40y - yjyi * trr_30y;
                    gout16 += hrr_0100x * hrr_3100y * wt;
                    gout17 += hrr_0100x * hrr_2100y * trr_10z;
                    gout18 += hrr_0100x * hrr_1100y * trr_20z;
                    gout19 += hrr_0100x * hrr_0100y * trr_30z;
                    double hrr_0100z = trr_10z - zjzi * wt;
                    gout20 += hrr_3100x * fac * hrr_0100z;
                    gout21 += hrr_2100x * trr_10y * hrr_0100z;
                    double hrr_1100z = trr_20z - zjzi * trr_10z;
                    gout22 += hrr_2100x * fac * hrr_1100z;
                    gout23 += hrr_1100x * trr_20y * hrr_0100z;
                    gout24 += hrr_1100x * trr_10y * hrr_1100z;
                    double hrr_2100z = trr_30z - zjzi * trr_20z;
                    gout25 += hrr_1100x * fac * hrr_2100z;
                    gout26 += hrr_0100x * trr_30y * hrr_0100z;
                    gout27 += hrr_0100x * trr_20y * hrr_1100z;
                    gout28 += hrr_0100x * trr_10y * hrr_2100z;
                    double trr_40z = c0z * trr_30z + 3*b10 * trr_20z;
                    double hrr_3100z = trr_40z - zjzi * trr_30z;
                    gout29 += hrr_0100x * fac * hrr_3100z;
                    double hrr_0200y = hrr_1100y - yjyi * hrr_0100y;
                    gout30 += trr_30x * hrr_0200y * wt;
                    double hrr_1200y = hrr_2100y - yjyi * hrr_1100y;
                    gout31 += trr_20x * hrr_1200y * wt;
                    gout32 += trr_20x * hrr_0200y * trr_10z;
                    double hrr_2200y = hrr_3100y - yjyi * hrr_2100y;
                    gout33 += trr_10x * hrr_2200y * wt;
                    gout34 += trr_10x * hrr_1200y * trr_10z;
                    gout35 += trr_10x * hrr_0200y * trr_20z;
                    double trr_50y = c0y * trr_40y + 4*b10 * trr_30y;
                    double hrr_4100y = trr_50y - yjyi * trr_40y;
                    double hrr_3200y = hrr_4100y - yjyi * hrr_3100y;
                    gout36 += 1 * hrr_3200y * wt;
                    gout37 += 1 * hrr_2200y * trr_10z;
                    gout38 += 1 * hrr_1200y * trr_20z;
                    gout39 += 1 * hrr_0200y * trr_30z;
                    gout40 += trr_30x * hrr_0100y * hrr_0100z;
                    gout41 += trr_20x * hrr_1100y * hrr_0100z;
                    gout42 += trr_20x * hrr_0100y * hrr_1100z;
                    gout43 += trr_10x * hrr_2100y * hrr_0100z;
                    gout44 += trr_10x * hrr_1100y * hrr_1100z;
                    gout45 += trr_10x * hrr_0100y * hrr_2100z;
                    gout46 += 1 * hrr_3100y * hrr_0100z;
                    gout47 += 1 * hrr_2100y * hrr_1100z;
                    gout48 += 1 * hrr_1100y * hrr_2100z;
                    gout49 += 1 * hrr_0100y * hrr_3100z;
                    double hrr_0200z = hrr_1100z - zjzi * hrr_0100z;
                    gout50 += trr_30x * fac * hrr_0200z;
                    gout51 += trr_20x * trr_10y * hrr_0200z;
                    double hrr_1200z = hrr_2100z - zjzi * hrr_1100z;
                    gout52 += trr_20x * fac * hrr_1200z;
                    gout53 += trr_10x * trr_20y * hrr_0200z;
                    gout54 += trr_10x * trr_10y * hrr_1200z;
                    double hrr_2200z = hrr_3100z - zjzi * hrr_2100z;
                    gout55 += trr_10x * fac * hrr_2200z;
                    gout56 += 1 * trr_30y * hrr_0200z;
                    gout57 += 1 * trr_20y * hrr_1200z;
                    gout58 += 1 * trr_10y * hrr_2200z;
                    double trr_50z = c0z * trr_40z + 4*b10 * trr_30z;
                    double hrr_4100z = trr_50z - zjzi * trr_40z;
                    double hrr_3200z = hrr_4100z - zjzi * hrr_3100z;
                    gout59 += 1 * fac * hrr_3200z;
                }
            }
        }
        if (task_id < ntasks) {
            int *ao_loc = envs.ao_loc;
            int nao = ao_loc[nbas];
            int i0 = ao_loc[ish];
            int j0 = ao_loc[jsh];
            int k0 = ao_loc[ksh];
            int l0 = ao_loc[lsh];
            double val;
            for (int i_dm = 0; i_dm < kmat.n_dm; ++i_dm) {
                double *dm = kmat.dm + i_dm * nao * nao;
                double *vk = kmat.vk + i_dm * nao * nao;
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(k0+0)];
                val += gout10 * dm[(j0+1)*nao+(k0+0)];
                val += gout20 * dm[(j0+2)*nao+(k0+0)];
                val += gout30 * dm[(j0+3)*nao+(k0+0)];
                val += gout40 * dm[(j0+4)*nao+(k0+0)];
                val += gout50 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+0)*nao+(l0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(k0+0)];
                val += gout11 * dm[(j0+1)*nao+(k0+0)];
                val += gout21 * dm[(j0+2)*nao+(k0+0)];
                val += gout31 * dm[(j0+3)*nao+(k0+0)];
                val += gout41 * dm[(j0+4)*nao+(k0+0)];
                val += gout51 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+1)*nao+(l0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(k0+0)];
                val += gout12 * dm[(j0+1)*nao+(k0+0)];
                val += gout22 * dm[(j0+2)*nao+(k0+0)];
                val += gout32 * dm[(j0+3)*nao+(k0+0)];
                val += gout42 * dm[(j0+4)*nao+(k0+0)];
                val += gout52 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+2)*nao+(l0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(k0+0)];
                val += gout13 * dm[(j0+1)*nao+(k0+0)];
                val += gout23 * dm[(j0+2)*nao+(k0+0)];
                val += gout33 * dm[(j0+3)*nao+(k0+0)];
                val += gout43 * dm[(j0+4)*nao+(k0+0)];
                val += gout53 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+3)*nao+(l0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(k0+0)];
                val += gout14 * dm[(j0+1)*nao+(k0+0)];
                val += gout24 * dm[(j0+2)*nao+(k0+0)];
                val += gout34 * dm[(j0+3)*nao+(k0+0)];
                val += gout44 * dm[(j0+4)*nao+(k0+0)];
                val += gout54 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+4)*nao+(l0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(k0+0)];
                val += gout15 * dm[(j0+1)*nao+(k0+0)];
                val += gout25 * dm[(j0+2)*nao+(k0+0)];
                val += gout35 * dm[(j0+3)*nao+(k0+0)];
                val += gout45 * dm[(j0+4)*nao+(k0+0)];
                val += gout55 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+5)*nao+(l0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(k0+0)];
                val += gout16 * dm[(j0+1)*nao+(k0+0)];
                val += gout26 * dm[(j0+2)*nao+(k0+0)];
                val += gout36 * dm[(j0+3)*nao+(k0+0)];
                val += gout46 * dm[(j0+4)*nao+(k0+0)];
                val += gout56 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+6)*nao+(l0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(k0+0)];
                val += gout17 * dm[(j0+1)*nao+(k0+0)];
                val += gout27 * dm[(j0+2)*nao+(k0+0)];
                val += gout37 * dm[(j0+3)*nao+(k0+0)];
                val += gout47 * dm[(j0+4)*nao+(k0+0)];
                val += gout57 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+7)*nao+(l0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(k0+0)];
                val += gout18 * dm[(j0+1)*nao+(k0+0)];
                val += gout28 * dm[(j0+2)*nao+(k0+0)];
                val += gout38 * dm[(j0+3)*nao+(k0+0)];
                val += gout48 * dm[(j0+4)*nao+(k0+0)];
                val += gout58 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+8)*nao+(l0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(k0+0)];
                val += gout19 * dm[(j0+1)*nao+(k0+0)];
                val += gout29 * dm[(j0+2)*nao+(k0+0)];
                val += gout39 * dm[(j0+3)*nao+(k0+0)];
                val += gout49 * dm[(j0+4)*nao+(k0+0)];
                val += gout59 * dm[(j0+5)*nao+(k0+0)];
                atomicAdd(vk+(i0+9)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(k0+0)];
                val += gout1 * dm[(i0+1)*nao+(k0+0)];
                val += gout2 * dm[(i0+2)*nao+(k0+0)];
                val += gout3 * dm[(i0+3)*nao+(k0+0)];
                val += gout4 * dm[(i0+4)*nao+(k0+0)];
                val += gout5 * dm[(i0+5)*nao+(k0+0)];
                val += gout6 * dm[(i0+6)*nao+(k0+0)];
                val += gout7 * dm[(i0+7)*nao+(k0+0)];
                val += gout8 * dm[(i0+8)*nao+(k0+0)];
                val += gout9 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+0)*nao+(l0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(k0+0)];
                val += gout11 * dm[(i0+1)*nao+(k0+0)];
                val += gout12 * dm[(i0+2)*nao+(k0+0)];
                val += gout13 * dm[(i0+3)*nao+(k0+0)];
                val += gout14 * dm[(i0+4)*nao+(k0+0)];
                val += gout15 * dm[(i0+5)*nao+(k0+0)];
                val += gout16 * dm[(i0+6)*nao+(k0+0)];
                val += gout17 * dm[(i0+7)*nao+(k0+0)];
                val += gout18 * dm[(i0+8)*nao+(k0+0)];
                val += gout19 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+1)*nao+(l0+0), val);
                val = 0;
                val += gout20 * dm[(i0+0)*nao+(k0+0)];
                val += gout21 * dm[(i0+1)*nao+(k0+0)];
                val += gout22 * dm[(i0+2)*nao+(k0+0)];
                val += gout23 * dm[(i0+3)*nao+(k0+0)];
                val += gout24 * dm[(i0+4)*nao+(k0+0)];
                val += gout25 * dm[(i0+5)*nao+(k0+0)];
                val += gout26 * dm[(i0+6)*nao+(k0+0)];
                val += gout27 * dm[(i0+7)*nao+(k0+0)];
                val += gout28 * dm[(i0+8)*nao+(k0+0)];
                val += gout29 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+2)*nao+(l0+0), val);
                val = 0;
                val += gout30 * dm[(i0+0)*nao+(k0+0)];
                val += gout31 * dm[(i0+1)*nao+(k0+0)];
                val += gout32 * dm[(i0+2)*nao+(k0+0)];
                val += gout33 * dm[(i0+3)*nao+(k0+0)];
                val += gout34 * dm[(i0+4)*nao+(k0+0)];
                val += gout35 * dm[(i0+5)*nao+(k0+0)];
                val += gout36 * dm[(i0+6)*nao+(k0+0)];
                val += gout37 * dm[(i0+7)*nao+(k0+0)];
                val += gout38 * dm[(i0+8)*nao+(k0+0)];
                val += gout39 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+3)*nao+(l0+0), val);
                val = 0;
                val += gout40 * dm[(i0+0)*nao+(k0+0)];
                val += gout41 * dm[(i0+1)*nao+(k0+0)];
                val += gout42 * dm[(i0+2)*nao+(k0+0)];
                val += gout43 * dm[(i0+3)*nao+(k0+0)];
                val += gout44 * dm[(i0+4)*nao+(k0+0)];
                val += gout45 * dm[(i0+5)*nao+(k0+0)];
                val += gout46 * dm[(i0+6)*nao+(k0+0)];
                val += gout47 * dm[(i0+7)*nao+(k0+0)];
                val += gout48 * dm[(i0+8)*nao+(k0+0)];
                val += gout49 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+4)*nao+(l0+0), val);
                val = 0;
                val += gout50 * dm[(i0+0)*nao+(k0+0)];
                val += gout51 * dm[(i0+1)*nao+(k0+0)];
                val += gout52 * dm[(i0+2)*nao+(k0+0)];
                val += gout53 * dm[(i0+3)*nao+(k0+0)];
                val += gout54 * dm[(i0+4)*nao+(k0+0)];
                val += gout55 * dm[(i0+5)*nao+(k0+0)];
                val += gout56 * dm[(i0+6)*nao+(k0+0)];
                val += gout57 * dm[(i0+7)*nao+(k0+0)];
                val += gout58 * dm[(i0+8)*nao+(k0+0)];
                val += gout59 * dm[(i0+9)*nao+(k0+0)];
                atomicAdd(vk+(j0+5)*nao+(l0+0), val);
                val = 0;
                val += gout0 * dm[(j0+0)*nao+(l0+0)];
                val += gout10 * dm[(j0+1)*nao+(l0+0)];
                val += gout20 * dm[(j0+2)*nao+(l0+0)];
                val += gout30 * dm[(j0+3)*nao+(l0+0)];
                val += gout40 * dm[(j0+4)*nao+(l0+0)];
                val += gout50 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+0)*nao+(k0+0), val);
                val = 0;
                val += gout1 * dm[(j0+0)*nao+(l0+0)];
                val += gout11 * dm[(j0+1)*nao+(l0+0)];
                val += gout21 * dm[(j0+2)*nao+(l0+0)];
                val += gout31 * dm[(j0+3)*nao+(l0+0)];
                val += gout41 * dm[(j0+4)*nao+(l0+0)];
                val += gout51 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+1)*nao+(k0+0), val);
                val = 0;
                val += gout2 * dm[(j0+0)*nao+(l0+0)];
                val += gout12 * dm[(j0+1)*nao+(l0+0)];
                val += gout22 * dm[(j0+2)*nao+(l0+0)];
                val += gout32 * dm[(j0+3)*nao+(l0+0)];
                val += gout42 * dm[(j0+4)*nao+(l0+0)];
                val += gout52 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+2)*nao+(k0+0), val);
                val = 0;
                val += gout3 * dm[(j0+0)*nao+(l0+0)];
                val += gout13 * dm[(j0+1)*nao+(l0+0)];
                val += gout23 * dm[(j0+2)*nao+(l0+0)];
                val += gout33 * dm[(j0+3)*nao+(l0+0)];
                val += gout43 * dm[(j0+4)*nao+(l0+0)];
                val += gout53 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+3)*nao+(k0+0), val);
                val = 0;
                val += gout4 * dm[(j0+0)*nao+(l0+0)];
                val += gout14 * dm[(j0+1)*nao+(l0+0)];
                val += gout24 * dm[(j0+2)*nao+(l0+0)];
                val += gout34 * dm[(j0+3)*nao+(l0+0)];
                val += gout44 * dm[(j0+4)*nao+(l0+0)];
                val += gout54 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+4)*nao+(k0+0), val);
                val = 0;
                val += gout5 * dm[(j0+0)*nao+(l0+0)];
                val += gout15 * dm[(j0+1)*nao+(l0+0)];
                val += gout25 * dm[(j0+2)*nao+(l0+0)];
                val += gout35 * dm[(j0+3)*nao+(l0+0)];
                val += gout45 * dm[(j0+4)*nao+(l0+0)];
                val += gout55 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+5)*nao+(k0+0), val);
                val = 0;
                val += gout6 * dm[(j0+0)*nao+(l0+0)];
                val += gout16 * dm[(j0+1)*nao+(l0+0)];
                val += gout26 * dm[(j0+2)*nao+(l0+0)];
                val += gout36 * dm[(j0+3)*nao+(l0+0)];
                val += gout46 * dm[(j0+4)*nao+(l0+0)];
                val += gout56 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+6)*nao+(k0+0), val);
                val = 0;
                val += gout7 * dm[(j0+0)*nao+(l0+0)];
                val += gout17 * dm[(j0+1)*nao+(l0+0)];
                val += gout27 * dm[(j0+2)*nao+(l0+0)];
                val += gout37 * dm[(j0+3)*nao+(l0+0)];
                val += gout47 * dm[(j0+4)*nao+(l0+0)];
                val += gout57 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+7)*nao+(k0+0), val);
                val = 0;
                val += gout8 * dm[(j0+0)*nao+(l0+0)];
                val += gout18 * dm[(j0+1)*nao+(l0+0)];
                val += gout28 * dm[(j0+2)*nao+(l0+0)];
                val += gout38 * dm[(j0+3)*nao+(l0+0)];
                val += gout48 * dm[(j0+4)*nao+(l0+0)];
                val += gout58 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+8)*nao+(k0+0), val);
                val = 0;
                val += gout9 * dm[(j0+0)*nao+(l0+0)];
                val += gout19 * dm[(j0+1)*nao+(l0+0)];
                val += gout29 * dm[(j0+2)*nao+(l0+0)];
                val += gout39 * dm[(j0+3)*nao+(l0+0)];
                val += gout49 * dm[(j0+4)*nao+(l0+0)];
                val += gout59 * dm[(j0+5)*nao+(l0+0)];
                atomicAdd(vk+(i0+9)*nao+(k0+0), val);
                val = 0;
                val += gout0 * dm[(i0+0)*nao+(l0+0)];
                val += gout1 * dm[(i0+1)*nao+(l0+0)];
                val += gout2 * dm[(i0+2)*nao+(l0+0)];
                val += gout3 * dm[(i0+3)*nao+(l0+0)];
                val += gout4 * dm[(i0+4)*nao+(l0+0)];
                val += gout5 * dm[(i0+5)*nao+(l0+0)];
                val += gout6 * dm[(i0+6)*nao+(l0+0)];
                val += gout7 * dm[(i0+7)*nao+(l0+0)];
                val += gout8 * dm[(i0+8)*nao+(l0+0)];
                val += gout9 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+0)*nao+(k0+0), val);
                val = 0;
                val += gout10 * dm[(i0+0)*nao+(l0+0)];
                val += gout11 * dm[(i0+1)*nao+(l0+0)];
                val += gout12 * dm[(i0+2)*nao+(l0+0)];
                val += gout13 * dm[(i0+3)*nao+(l0+0)];
                val += gout14 * dm[(i0+4)*nao+(l0+0)];
                val += gout15 * dm[(i0+5)*nao+(l0+0)];
                val += gout16 * dm[(i0+6)*nao+(l0+0)];
                val += gout17 * dm[(i0+7)*nao+(l0+0)];
                val += gout18 * dm[(i0+8)*nao+(l0+0)];
                val += gout19 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+1)*nao+(k0+0), val);
                val = 0;
                val += gout20 * dm[(i0+0)*nao+(l0+0)];
                val += gout21 * dm[(i0+1)*nao+(l0+0)];
                val += gout22 * dm[(i0+2)*nao+(l0+0)];
                val += gout23 * dm[(i0+3)*nao+(l0+0)];
                val += gout24 * dm[(i0+4)*nao+(l0+0)];
                val += gout25 * dm[(i0+5)*nao+(l0+0)];
                val += gout26 * dm[(i0+6)*nao+(l0+0)];
                val += gout27 * dm[(i0+7)*nao+(l0+0)];
                val += gout28 * dm[(i0+8)*nao+(l0+0)];
                val += gout29 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+2)*nao+(k0+0), val);
                val = 0;
                val += gout30 * dm[(i0+0)*nao+(l0+0)];
                val += gout31 * dm[(i0+1)*nao+(l0+0)];
                val += gout32 * dm[(i0+2)*nao+(l0+0)];
                val += gout33 * dm[(i0+3)*nao+(l0+0)];
                val += gout34 * dm[(i0+4)*nao+(l0+0)];
                val += gout35 * dm[(i0+5)*nao+(l0+0)];
                val += gout36 * dm[(i0+6)*nao+(l0+0)];
                val += gout37 * dm[(i0+7)*nao+(l0+0)];
                val += gout38 * dm[(i0+8)*nao+(l0+0)];
                val += gout39 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+3)*nao+(k0+0), val);
                val = 0;
                val += gout40 * dm[(i0+0)*nao+(l0+0)];
                val += gout41 * dm[(i0+1)*nao+(l0+0)];
                val += gout42 * dm[(i0+2)*nao+(l0+0)];
                val += gout43 * dm[(i0+3)*nao+(l0+0)];
                val += gout44 * dm[(i0+4)*nao+(l0+0)];
                val += gout45 * dm[(i0+5)*nao+(l0+0)];
                val += gout46 * dm[(i0+6)*nao+(l0+0)];
                val += gout47 * dm[(i0+7)*nao+(l0+0)];
                val += gout48 * dm[(i0+8)*nao+(l0+0)];
                val += gout49 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+4)*nao+(k0+0), val);
                val = 0;
                val += gout50 * dm[(i0+0)*nao+(l0+0)];
                val += gout51 * dm[(i0+1)*nao+(l0+0)];
                val += gout52 * dm[(i0+2)*nao+(l0+0)];
                val += gout53 * dm[(i0+3)*nao+(l0+0)];
                val += gout54 * dm[(i0+4)*nao+(l0+0)];
                val += gout55 * dm[(i0+5)*nao+(l0+0)];
                val += gout56 * dm[(i0+6)*nao+(l0+0)];
                val += gout57 * dm[(i0+7)*nao+(l0+0)];
                val += gout58 * dm[(i0+8)*nao+(l0+0)];
                val += gout59 * dm[(i0+9)*nao+(l0+0)];
                atomicAdd(vk+(j0+5)*nao+(k0+0), val);
            }
        }
    }
    if (sq_id == 0) {
        pair_ij = atomicAdd(head, 1);
    }
    __syncthreads();
}
}

/*
 * Launch the unrolled exchange-matrix kernel matching the angular momentum
 * pattern (li,lj,lk,ll) stored in `bounds`, if one exists.
 *
 * envs, kmat, bounds : passed by value to the selected kernel.
 * pool               : device buffer of int. Block b uses the region starting
 *                      at pool + b*QUEUE_DEPTH, and the int located at
 *                      pool + workers*QUEUE_DEPTH is the shared work counter
 *                      (`head`) that this function resets to zero before the
 *                      launch. `workers` is the multiprocessor count of
 *                      device 0, so the buffer must hold at least
 *                      workers*QUEUE_DEPTH + 1 ints.
 *
 * Returns 1 when a kernel was launched, 0 when nothing was launched: either
 * no unrolled kernel exists for this pattern, or the device query / counter
 * reset failed. Errors from the launch itself are not inspected here; they
 * stay pending in the runtime's last-error state.
 *
 * NOTE(review): the multiprocessor count is read from device 0, not from the
 * current device. This is kept as in the original because the size of `pool`
 * is decided by the caller -- confirm before using with heterogeneous GPUs.
 */
int rys_k_unrolled(RysIntEnvVars *envs, JKMatrix *kmat, BoundsInfo *bounds, int *pool)
{
    int li = bounds->li;
    int lj = bounds->lj;
    int lk = bounds->lk;
    int ll = bounds->ll;
    // Base-5 encoding of the four angular momenta; selects the kernel below.
    int ijkl = li*125 + lj*25 + lk*5 + ll;
    int nroots = bounds->nroots;

    // Default launch shape: 256 x 1 threads, no additional shared memory.
    int nsq_per_block = 256;
    int gout_stride = 1;
    // Extra dynamic shared memory (in doubles) requested by some kernels on
    // top of the roots/weights buffer and the iprim*jprim cache.
    // Presumably scratch space for the kernels that use gout_stride > 1.
    int extra_doubles = 0;

    switch (ijkl) {
    case 261:
    case 285:
    case 305:
        nsq_per_block = 64;
        gout_stride = 4;
        extra_doubles = 4032;
        break;
    case 281:
        nsq_per_block = 32;
        gout_stride = 8;
        extra_doubles = 2592;
        break;
    case 381:
    case 405:
        nsq_per_block = 64;
        gout_stride = 4;
        extra_doubles = 3648;
        break;
    }

#if CUDA_VERSION >= 12040
    // These kernels get twice the default block size on CUDA >= 12.4.
    switch (ijkl) {
    case 0:
    case 125:
    case 130:
    case 150:
    case 250:
    case 255:
    case 275:
    case 375:
        nsq_per_block *= 2;
        break;
    }
#else
    // Older toolkits: let adjust_threads tune the block size per kernel.
    switch (ijkl) {
    case 0: adjust_threads(rys_k_0000, nsq_per_block); break;
    case 125: adjust_threads(rys_k_1000, nsq_per_block); break;
    case 130: adjust_threads(rys_k_1010, nsq_per_block); break;
    case 150: adjust_threads(rys_k_1100, nsq_per_block); break;
    case 250: adjust_threads(rys_k_2000, nsq_per_block); break;
    case 255: adjust_threads(rys_k_2010, nsq_per_block); break;
    case 275: adjust_threads(rys_k_2100, nsq_per_block); break;
    case 375: adjust_threads(rys_k_3000, nsq_per_block); break;
    }
#endif

    // One block per multiprocessor. A single-attribute query is much cheaper
    // than filling a whole cudaDeviceProp on every dispatch, and checking the
    // result avoids launching with an uninitialised block count.
    int workers = 0;
    if (cudaDeviceGetAttribute(&workers, cudaDevAttrMultiProcessorCount, 0) != cudaSuccess
        || workers <= 0) {
        return 0;
    }

    // The work counter lives right after the per-block regions of the pool.
    // The kernels draw work items from it with atomicAdd, so it must be zero
    // at launch; do not launch on a stale counter if the reset fails.
    int *head = pool + workers * QUEUE_DEPTH;
    if (cudaMemset(head, 0, sizeof(int)) != cudaSuccess) {
        return 0;
    }

    dim3 threads(nsq_per_block, gout_stride);
    int iprim = bounds->iprim;
    int jprim = bounds->jprim;
    // Dynamic shared memory layout (in doubles): nroots*2 values per thread
    // along x, then iprim*jprim entries, then the kernel-specific extra.
    int buflen = nroots*2 * nsq_per_block + iprim*jprim + extra_doubles;
    size_t shm_size = buflen * sizeof(double);

    switch (ijkl) {
    case 0:
        rys_k_0000<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 125:
        rys_k_1000<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 130:
        rys_k_1010<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 131:
        rys_k_1011<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 150:
        rys_k_1100<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 155:
        rys_k_1110<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 156:
        rys_k_1111<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 250:
        rys_k_2000<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 255:
        rys_k_2010<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 256:
        rys_k_2011<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 260:
        rys_k_2020<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 261:
        rys_k_2021<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 275:
        rys_k_2100<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 280:
        rys_k_2110<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 281:
        rys_k_2111<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 285:
        rys_k_2120<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 300:
        rys_k_2200<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 305:
        rys_k_2210<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 375:
        rys_k_3000<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 380:
        rys_k_3010<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 381:
        rys_k_3011<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 385:
        rys_k_3020<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 400:
        rys_k_3100<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 405:
        rys_k_3110<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    case 425:
        rys_k_3200<<<workers, threads, shm_size>>>(*envs, *kmat, *bounds, pool, head); break;
    default:
        // No unrolled kernel for this angular momentum pattern.
        return 0;
    }
    return 1;
}
